diff --git a/.gitattributes b/.gitattributes index 40a26623..8451e4c4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,5 +1,6 @@ # Mark generated files as binary, ignore them in git diff. # dnn +dnn/scripts/cutlass_generator/list.bzl binary dnn/src/cuda/conv_bias/int4/kimpl/* binary dnn/src/cuda/conv_bias/int8/kimpl/* binary dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary diff --git a/dnn/scripts/cutlass_generator/BUILD b/dnn/scripts/cutlass_generator/BUILD new file mode 100644 index 00000000..4c02fb51 --- /dev/null +++ b/dnn/scripts/cutlass_generator/BUILD @@ -0,0 +1,18 @@ +load("list.bzl", "cutlass_gen_list") + +genrule( + name = "cutlass_kimpls", + outs = cutlass_gen_list, + cmd = """GEN=$(location //brain/megbrain/dnn/scripts/cutlass_generator:generator.py) + pwd > /tmp/a + echo $(@D) > /tmp/b + python3 $$GEN --operations gemm --type simt $(@D) + python3 $$GEN --operations gemv --type simt $(@D) + python3 $$GEN --operations deconv --type simt $(@D) + python3 $$GEN --operations conv2d --type simt $(@D) + python3 $$GEN --operations conv2d --type tensorop8816 $(@D) + python3 $$GEN --operations conv2d --type tensorop8832 $(@D) + """, + tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"], + visibility = ["//visibility:public"], +) diff --git a/dnn/scripts/cutlass_generator/README.md b/dnn/scripts/cutlass_generator/README.md new file mode 100644 index 00000000..3b1e6505 --- /dev/null +++ b/dnn/scripts/cutlass_generator/README.md @@ -0,0 +1,19 @@ +# Generate device kernel registration code for CUTLASS kernels +## Usage +```bash +python3 generator.py [--operations {gemm, gemv, conv2d, deconv}] [--type {simt, tensorop8816, tensorop8832}] + output +``` +- operations: operation kind, including gemm|gemv|conv2d|deconv +- type: opcode class, simt|tensorop8816|tensorop8832 +- output: the output directory for CUTLASS kernels + +## Generate file list for bazel + +We generate `list.bzl` because the `genrule` method of bazel requires that the output file list be specified in the analysis phase. + +Please call `gen_list.py` when new operations are added. 
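The generated `list.bzl` simply assigns the complete set of kernel file names to a Starlark list; the `BUILD` rule above loads it with `load("list.bzl", "cutlass_gen_list")` and passes it to `outs`. A minimal sketch of its shape — the entry names below are purely illustrative, the real file enumerates every generated `.cu` kernel — looks like:

```python
# list.bzl -- generated by gen_list.py, do not edit by hand (illustrative sketch)
cutlass_gen_list = [
    # one entry per generated kernel source, for example:
    "cutlass_simt_sgemm_example_kernel.cu",
    "cutlass_tensorop_conv2d_example_kernel.cu",
]
```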
+ +```bash +python3 gen_list.py +``` diff --git a/dnn/scripts/cutlass_generator/conv2d_operation.py b/dnn/scripts/cutlass_generator/conv2d_operation.py new file mode 100644 index 00000000..6cd18047 --- /dev/null +++ b/dnn/scripts/cutlass_generator/conv2d_operation.py @@ -0,0 +1,614 @@ +# +# \file generator.py +# +# \brief Generates the CUTLASS Library's instances +# +# + +import enum +import os.path +import shutil +from typing import Tuple, List + +from lazy_file import LazyFile +from library import * + +################################################################################################### + +# +class Conv2dOperation: + # + def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \ + epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \ + need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNt): + + self.operation_kind = OperationKind.Conv2d + self.conv_kind = conv_kind + self.arch = arch + self.tile_description = tile_description + self.conv_type = conv_type + self.src = src + self.flt = flt + self.bias = bias + self.dst = dst + self.element_epilogue = element_epilogue + self.epilogue_functor = epilogue_functor + self.swizzling_functor = swizzling_functor + self.need_load_from_const = need_load_from_const + self.implicit_gemm_mode = implicit_gemm_mode + # + def accumulator_type(self): + accum = self.tile_description.math_instruction.element_accumulator + + return accum + + # + def core_name(self): + ''' The basic operation kind is prefixed with a letter indicating the accumulation type. ''' + + intermediate_type = '' + + if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp: + inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape) + if self.tile_description.math_instruction.element_a != self.flt.element and \ + self.tile_description.math_instruction.element_a != self.accumulator_type(): + intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] + else: + inst_shape = '' + + unity_kernel = '' + if not self.need_load_from_const: + unity_kernel = '_1x1' + + return "%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \ + inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \ + ShortEpilogueNames[self.epilogue_functor]) + + # + def extended_name(self): + if self.dst.element != self.tile_description.math_instruction.element_accumulator: + if self.src.element != self.flt.element: + extended_name = "${element_dst}_${core_name}_${element_src}_${element_flt}" + elif self.src.element == self.flt.element: + extended_name = "${element_dst}_${core_name}_${element_src}" + else: + if self.src.element != self.flt.element: + extended_name = "${core_name}_${element_src}_${element_flt}" + elif self.src.element == self.flt.element: + extended_name = "${core_name}_${element_src}" + + extended_name = SubstituteTemplate(extended_name, { + 'element_src': DataTypeNames[self.src.element], + 'element_flt': DataTypeNames[self.flt.element], + 'element_dst': DataTypeNames[self.dst.element], + 'core_name': self.core_name() + }) + + return extended_name + + # + def layout_name(self): + if self.src.layout == self.dst.layout: + layout_name = "${src_layout}_${flt_layout}" + else: + layout_name = "${src_layout}_${flt_layout}_${dst_layout}" + + layout_name = SubstituteTemplate(layout_name, { + 'src_layout': ShortLayoutTypeNames[self.src.layout], + 'flt_layout': 
ShortLayoutTypeNames[self.flt.layout], + 'dst_layout': ShortLayoutTypeNames[self.dst.layout], + }) + + return layout_name + +# + def configuration_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' + + opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] + + warp_shape = [int(self.tile_description.threadblock_shape[idx] / self.tile_description.warp_count[idx]) for idx in range(3)] + + + threadblock = "%dx%dx%d_%dx%dx%d_%d" % ( + self.tile_description.threadblock_shape[0], + self.tile_description.threadblock_shape[1], + self.tile_description.threadblock_shape[2], + warp_shape[0], + warp_shape[1], + warp_shape[2], + self.tile_description.stages, + ) + + configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}" + + return SubstituteTemplate( + configuration_name, + { + 'opcode_class': opcode_class_name, + 'extended_name': self.extended_name(), + 'threadblock': threadblock, + 'layout': self.layout_name(), + } + ) + + # + def procedural_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' + return self.configuration_name() + +################################################################################################### +# +# Emits single instances of a CUTLASS device-wide operator +# +################################################################################################### + +class EmitConv2dInstance: + def __init__(self): + self.template = """ +// kernel instance "${operation_name}" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + ${element_src}, + ${layout_src}, + ${element_flt}, + ${layout_flt}, + ${element_dst}, + ${layout_dst}, + ${element_bias}, + ${layout_bias}, + ${element_accumulator}, + ${conv_type}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_dst}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_bias}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${alignment_src}, + ${alignment_filter}, + ${nonuninity_kernel}, + ${math_operator}, + ${implicit_gemm_mode}>; +""" + + + def emit(self, operation): + + warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)] + + epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) / DataTypeSize[operation.dst.element]) + + values = { + 'operation_name': operation.procedural_name(), + 'conv_type': ConvTypeTag[operation.conv_type], + 'element_src': DataTypeTag[operation.src.element], + 'layout_src': LayoutTag[operation.src.layout], + 'element_flt': DataTypeTag[operation.flt.element], + 'layout_flt': LayoutTag[operation.flt.layout], + 'element_dst': DataTypeTag[operation.dst.element], + 'layout_dst': LayoutTag[operation.dst.layout], + 'element_bias': DataTypeTag[operation.bias.element], + 'layout_bias': LayoutTag[operation.bias.layout], + 'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 
'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'alignment_src': str(operation.src.alignment), + 'alignment_filter': str(operation.flt.alignment), + 'nonuninity_kernel': str(operation.need_load_from_const).lower(), + 'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation], + 'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode] + } + + return SubstituteTemplate(self.template, values) + +class EmitDeconvInstance: + def __init__(self): + self.template = """ +// kernel instance "${operation_name}" generated by cutlass generator +using Deconvolution = + typename cutlass::conv::device::Deconvolution< + ${element_src}, + ${layout_src}, + ${element_flt}, + ${layout_flt}, + ${element_dst}, + ${layout_dst}, + ${element_bias}, + ${layout_bias}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_dst}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_bias}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${alignment_src}, + ${alignment_filter}, + ${nonuninity_kernel}, + ${math_operator}, + ${implicit_gemm_mode}>; +""" + + + def emit(self, operation): + + warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)] + + epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) / DataTypeSize[operation.dst.element]) + + values = { + 'operation_name': operation.procedural_name(), + 'element_src': DataTypeTag[operation.src.element], + 'layout_src': LayoutTag[operation.src.layout], + 'element_flt': DataTypeTag[operation.flt.element], + 'layout_flt': LayoutTag[operation.flt.layout], + 'element_dst': DataTypeTag[operation.dst.element], + 'layout_dst': LayoutTag[operation.dst.layout], + 'element_bias': DataTypeTag[operation.bias.element], + 'layout_bias': LayoutTag[operation.bias.layout], + 'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': 
str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'alignment_src': str(operation.src.alignment), + 'alignment_filter': str(operation.flt.alignment), + 'nonuninity_kernel': str(operation.need_load_from_const).lower(), + 'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation], + 'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode] + } + + return SubstituteTemplate(self.template, values) + + +################################################################################################### +# +# Generator functions for all layouts +# +################################################################################################### + +# +def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \ + skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNt): + operations = [] + + element_epilogue = DataType.f32 + if conv_kind == ConvKind.Fprop: + if src_layout == LayoutType.TensorNHWC: + swizzling_functor = SwizzlingFunctor.ConvFpropNHWC + else: + swizzling_functor = SwizzlingFunctor.ConvFpropNCxHWx + else: + swizzling_functor = SwizzlingFunctor.ConvDgradNCxHWx + + # skip rule + def filter_tile_with_layout(tile: TileDescription, layout: LayoutType) -> bool: + return layout == LayoutType.TensorNC32HW32 and \ + tile.threadblock_shape[0] % 32 != 0 + + # rule for bias_type and epilogues + def get_bias_type_and_epilogues(tile: TileDescription, \ + out_dtype: DataType) -> Tuple[DataType, List[EpilogueFunctor]]: + if tile.math_instruction.element_accumulator == DataType.s32 and \ + out_dtype != DataType.f32: + bias_type = DataType.s32 + if tile.math_instruction.element_b == DataType.u4: + epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp, EpilogueFunctor.BiasAddLinearCombinationReluClamp] + else: + epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp, EpilogueFunctor.BiasAddLinearCombinationReluClamp, \ + EpilogueFunctor.BiasAddLinearCombinationHSwishClamp] + elif tile.math_instruction.element_accumulator == DataType.f32 or \ + out_dtype == DataType.f32: + bias_type = DataType.f32 + epilogues = [EpilogueFunctor.BiasAddLinearCombination, EpilogueFunctor.BiasAddLinearCombinationRelu, \ + EpilogueFunctor.BiasAddLinearCombinationHSwish] + return bias_type, epilogues + + # rule for filter alignment + def get_flt_align(tile: TileDescription) -> int: + nonlocal flt_align + if tile.math_instruction.opcode_class == OpcodeClass.Simt \ + and tile.math_instruction.element_accumulator == DataType.s32: + thread_num = tile.warp_count[0] * tile.warp_count[1] * tile.warp_count[2] * 32 + flt_block = tile.threadblock_shape[0] * tile.threadblock_shape[2] \ + * DataTypeSize[tile.math_instruction.element_a] + load_per_thread = 
flt_block//thread_num + if load_per_thread >= 128: + flt_align = 128 + elif load_per_thread >= 64: + flt_align = 64 + else: + assert load_per_thread >= 32 + flt_align = 32 + return flt_align + + def get_dst_align(tile: TileDescription, out_layout: LayoutType) -> int: + nonlocal dst_align + if tile.math_instruction.opcode_class == OpcodeClass.TensorOp \ + and dst_layout == LayoutType.TensorNC4HW4: + dst_align = 32 + return dst_align + + def filter_epilogue_with_conv_kind(epilogue: EpilogueFunctor, conv_kind: ConvKind) -> bool: + return conv_kind == ConvKind.Dgrad \ + and epilogue != EpilogueFunctor.BiasAddLinearCombinationClamp + + # loop over all tile descriptions + for tile in tile_descriptions: + if filter_tile_with_layout(tile, dst_layout): + continue + + bias_type, epilogues = get_bias_type_and_epilogues(tile, dst_type) + + flt_align = get_flt_align(tile) + + dst_align = get_dst_align(tile, dst_layout) + + for epilogue in epilogues: + if filter_epilogue_with_conv_kind(epilogue, conv_kind): + continue + + if dst_type == DataType.f32: + bias_type = DataType.f32 + # + src = TensorDescription(tile.math_instruction.element_b, src_layout, int(src_align / DataTypeSize[tile.math_instruction.element_b])) + flt = TensorDescription(tile.math_instruction.element_a, flt_layout, int(flt_align / DataTypeSize[tile.math_instruction.element_a])) + bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type]))) + dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type])) + + new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode) + operations.append(new_operation) + if not skip_unity_kernel: + new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode) + operations.append(new_operation) + return operations + +################################################################################################### +# +# Emitters functions for all targets +# +################################################################################################### + +class EmitConv2dConfigurationLibrary: + def __init__(self, operation_path, configuration_name): + self.configuration_name = configuration_name + self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name) + + self.instance_emitter = EmitConv2dInstance() + + self.instance_template = """ +${operation_instance} + +// Derived class +struct ${operation_name} : + public ${operation_name}_base { }; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + self.header_template = """ +/* + Generated by conv2d_operation.py - Do not edit. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "library_internal.h" +#include "conv2d_operation.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// +""" + + self.configuration_header = """ + +namespace cutlass { +namespace library { + +// Initialize all instances +void initialize_${configuration_name}(Manifest &manifest) { + +""" + + self.configuration_instance = """ + using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution< + ${operation_name}>; + + manifest.append(new cutlass::library::Conv2dOperation< + Operation_${operation_name}>( + "${operation_name}")); + +""" + + self.configuration_epilogue = """ +} +""" + self.epilogue_template = """ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + + # + def __enter__(self): + self.configuration_file = open(self.configuration_path, "w") + self.configuration_file.write(SubstituteTemplate(self.header_template, { + 'configuration_name': self.configuration_name + })) + self.operations = [] + return self + + # + def emit(self, operation): + self.operations.append(operation) + self.configuration_file.write(SubstituteTemplate(self.instance_template, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name(), + 'operation_instance': self.instance_emitter.emit(operation) + })) + + # + def __exit__(self, exception_type, exception_value, traceback): + + self.configuration_file.write(SubstituteTemplate(self.configuration_header, { + 'configuration_name': self.configuration_name + })) + + for operation in self.operations: + self.configuration_file.write(SubstituteTemplate(self.configuration_instance, { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name() + })) + + self.configuration_file.write(self.configuration_epilogue) + self.configuration_file.write(self.epilogue_template) + self.configuration_file.close() + +################################################################################################### +################################################################################################### + +# Emitters for Conv Kernel Wrapper +# +################################################################################################### + +class EmitConvSingleKernelWrapper(): + def __init__(self, kernel_path, operation, wrapper_path): + self.kernel_path = kernel_path + self.wrapper_path = wrapper_path + self.operation = operation + + self.conv_wrappers = { \ + ConvKind.Fprop: """ +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +""", \ + ConvKind.Dgrad: """ +template void 
megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( + const typename Deconvolution::ElementSrc* d_src, + const typename Deconvolution::ElementFilter* d_filter, + const typename Deconvolution::ElementBias* d_bias, + const typename Deconvolution::ElementDst* d_z, + typename Deconvolution::ElementDst* d_dst, + int* workspace, + typename Deconvolution::ConvolutionParameter const& conv_param, + typename Deconvolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +""", \ + } + + if self.operation.conv_kind == ConvKind.Fprop: + self.instance_emitter = EmitConv2dInstance() + else: + assert self.operation.conv_kind == ConvKind.Dgrad + self.instance_emitter = EmitDeconvInstance() + + self.header_template = """ +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "${wrapper_path}" +""" + self.instance_template = """ +${operation_instance} +""" + self.wrapper_template = """ +${wrapper_instance} +""" + + self.epilogue_template = """ +#pragma GCC diagnostic pop +#endif +""" + + # + def __enter__(self): + self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name()) + self.kernel_file = LazyFile(self.kernel_path) + self.kernel_file.write(SubstituteTemplate(self.header_template, { + 'wrapper_path': self.wrapper_path, + })) + return self + + # + def emit(self): + self.kernel_file.write(SubstituteTemplate(self.instance_template, { + 'operation_instance': self.instance_emitter.emit(self.operation), + })) + + # emit wrapper + wrapper = SubstituteTemplate(self.wrapper_template, { + 'wrapper_instance': self.conv_wrappers[self.operation.conv_kind], + }) + self.kernel_file.write(wrapper) + + # + def __exit__(self, exception_type, exception_value, traceback): + self.kernel_file.write(self.epilogue_template) + self.kernel_file.close() + + +################################################################################################### +################################################################################################### + diff --git a/dnn/scripts/cutlass_generator/gemm_operation.py b/dnn/scripts/cutlass_generator/gemm_operation.py new file mode 100644 index 00000000..3cd28715 --- /dev/null +++ b/dnn/scripts/cutlass_generator/gemm_operation.py @@ -0,0 +1,1085 @@ +# +# \file generator.py +# +# \brief Generates the CUTLASS Library's instances +# + +import enum +import os.path +import shutil +import functools +import operator + +from lazy_file import LazyFile +from library import * + + +################################################################################################### +# +# Data structure modeling a GEMM operation +# +################################################################################################### + +# +class GemmOperation: + # + def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \ + epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8): + + self.operation_kind = OperationKind.Gemm + self.arch = arch + self.tile_description = tile_description + self.gemm_kind = gemm_kind + self.A = A + self.B = B + self.C = C + self.element_epilogue = element_epilogue + self.epilogue_functor = epilogue_functor + self.swizzling_functor = swizzling_functor + + # + def is_complex(self): + complex_operators = [ + MathOperation.multiply_add_complex, + MathOperation.multiply_add_complex_gaussian + ] + return 
self.tile_description.math_instruction.math_operation in complex_operators + + # + def is_split_k_parallel(self): + return self.gemm_kind == GemmKind.SplitKParallel + + # + def is_planar_complex(self): + return self.gemm_kind in (GemmKind.PlanarComplex, GemmKind.PlanarComplexArray) + + # + def accumulator_type(self): + accum = self.tile_description.math_instruction.element_accumulator + + if self.is_complex(): + return get_complex_from_real(accum) + + return accum + + # + def short_math_name(self): + if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian: + return "g%s" % ShortDataTypeNames[self.accumulator_type()] + return ShortDataTypeNames[self.accumulator_type()] + + + # + def core_name(self): + ''' The basic operation kind is prefixed with a letter indicating the accumulation type. ''' + + inst_shape = '' + inst_operation = '' + intermediate_type = '' + + math_operations_map = { + MathOperation.xor_popc: 'xor', + } + + if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp or \ + self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp: + + math_op = self.tile_description.math_instruction.math_operation + math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else '' + + inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape) + inst_shape += math_op_string + + if self.tile_description.math_instruction.element_a != self.A.element and \ + self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator: + intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] + + return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, GemmKindNames[self.gemm_kind]) + + # + def extended_name(self): + ''' Append data types if they differ from compute type. ''' + if self.is_complex(): + extended_name = "${core_name}" + else: + if self.C.element != self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${element_c}_${core_name}_${element_a}" + elif self.C.element == self.tile_description.math_instruction.element_accumulator and \ + self.A.element != self.tile_description.math_instruction.element_accumulator: + extended_name = "${core_name}_${element_a}" + else: + extended_name = "${core_name}" + + extended_name = SubstituteTemplate(extended_name, { + 'element_a': DataTypeNames[self.A.element], + 'element_c': DataTypeNames[self.C.element], + 'core_name': self.core_name() + }) + + return extended_name + + # + def layout_name(self): + if self.is_complex() or self.is_planar_complex(): + return "%s%s" % ( + ShortComplexLayoutNames[(self.A.layout, self.A.complex_transform)], + ShortComplexLayoutNames[(self.B.layout, self.B.complex_transform)] + ) + return "%s%s" % (ShortLayoutTypeNames[self.A.layout], ShortLayoutTypeNames[self.B.layout]) + + # + def procedural_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. 
''' + threadblock = self.tile_description.procedural_name() + + opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] + + alignment = max([self.A.alignment, self.B.alignment, self.C.alignment]) + + return SubstituteTemplate( + "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_align${alignment}", + { + 'opcode_class': opcode_class_name, + 'extended_name': self.extended_name(), + 'threadblock': threadblock, + 'layout': self.layout_name(), + 'alignment': "%d" % self.A.alignment, + } + ) + + # + def configuration_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' + return self.procedural_name() + +################################################################################################### +# +# Data structure modeling a GEMV Batched Strided operation +# +################################################################################################### + +# +class GemvBatchedStridedOperation: + # + def __init__(self, gemm_kind, arch, math_inst, threadblock_shape, thread_shape, A, B, C): + + self.operation_kind = OperationKind.Gemm + self.arch = arch + self.gemm_kind = gemm_kind + self.math_instruction = math_inst + self.threadblock_shape = threadblock_shape + self.thread_shape = thread_shape + self.A = A + self.B = B + self.C = C + + # + def accumulator_type(self): + accum = self.math_instruction.element_accumulator + + return accum + + # + def short_math_name(self): + return ShortDataTypeNames[self.accumulator_type()] + + + # + def core_name(self): + ''' The basic operation kind is prefixed with a letter indicating the accumulation type. ''' + + return "%s%s" % (self.short_math_name(), \ + GemmKindNames[self.gemm_kind]) + + # + def extended_name(self): + ''' Append data types if they differ from compute type. ''' + if self.C.element != self.math_instruction.element_accumulator and \ + self.A.element != self.math_instruction.element_accumulator: + extended_name = "${element_c}_${core_name}_${element_a}" + elif self.C.element == self.math_instruction.element_accumulator and \ + self.A.element != self.math_instruction.element_accumulator: + extended_name = "${core_name}_${element_a}" + else: + extended_name = "${core_name}" + + extended_name = SubstituteTemplate(extended_name, { + 'element_a': DataTypeNames[self.A.element], + 'element_c': DataTypeNames[self.C.element], + 'core_name': self.core_name() + }) + + return extended_name + + # + def layout_name(self): + return "%s%s" % (ShortLayoutTypeNames[self.A.layout], ShortLayoutTypeNames[self.B.layout]) + + # + def procedural_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. ''' + threadblock = "%dx%d_%d" % (self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2]) + + opcode_class_name = OpcodeClassNames[self.math_instruction.opcode_class] + + alignment_a = self.A.alignment + alignment_b = self.B.alignment + + return SubstituteTemplate( + "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_align${alignment_a}x${alignment_b}", + { + 'opcode_class': opcode_class_name, + 'extended_name': self.extended_name(), + 'threadblock': threadblock, + 'layout': self.layout_name(), + 'alignment_a': "%d" % alignment_a, + 'alignment_b': "%d" % alignment_b, + } + ) + + # + def configuration_name(self): + ''' The full procedural name indicates architecture, extended name, tile size, and layout. 
''' + return self.procedural_name() + +# +def GeneratesGemm(tile, data_type, layout_a, layout_b, layout_c, min_cc, align_a = 32, align_b = 32, align_c = 32): + operations = [] + swizzling_functor = SwizzlingFunctor.Identity1 + + element_a, element_b, element_c, element_epilogue = data_type + + if tile.math_instruction.element_accumulator == DataType.s32: + epilogues = [EpilogueFunctor.LinearCombinationClamp] + else: + assert tile.math_instruction.element_accumulator == DataType.f32 + epilogues = [EpilogueFunctor.LinearCombination] + + for epilogue in epilogues: + A = TensorDescription(element_a, layout_a, int(align_a//DataTypeSize[element_a])) + B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b])) + C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c])) + operations.append(GemmOperation(GemmKind.Gemm, min_cc, tile, A, B, C, \ + element_epilogue, epilogue, swizzling_functor)) + operations.append(GemmOperation(GemmKind.SplitKParallel, min_cc, tile, A, B, C, \ + element_epilogue, epilogue, swizzling_functor)) + return operations + +def GeneratesGemv(math_inst, threadblock_shape, thread_shape, data_type, layout_a, layout_b, layout_c, min_cc, \ + align_a = 32, align_b = 32, align_c = 32): + element_a, element_b, element_c, element_epilogue = data_type + + A = TensorDescription(element_a, layout_a, int(align_a//DataTypeSize[element_a])) + B = TensorDescription(element_b, layout_b, int(align_b//DataTypeSize[element_b])) + C = TensorDescription(element_c, layout_c, int(align_c//DataTypeSize[element_c])) + return GemvBatchedStridedOperation(GemmKind.GemvBatchedStrided, min_cc, math_inst, threadblock_shape, thread_shape, \ + A, B, C) + +################################################################################################### +# +# Emits single instances of a CUTLASS device-wide operator +# +################################################################################################### + +# +class EmitGemmInstance: + ''' Responsible for emitting a CUTLASS template definition''' + + def __init__(self): + self.gemm_template = """ + // Gemm operator ${operation_name} + using Operation_${operation_name} = cutlass::gemm::device::Gemm< + ${element_a}, ${layout_a}, + ${element_b}, ${layout_b}, + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${align_a}, + ${align_b}, + false, + ${math_operation} + ${residual} + >; +""" + self.gemm_complex_template = """ + // Gemm operator ${operation_name} + using Operation_${operation_name} = cutlass::gemm::device::GemmComplex< + ${element_a}, ${layout_a}, + ${element_b}, ${layout_b}, + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + 
${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${transform_a}, + ${transform_b}, + ${math_operation} + ${residual} + >; +""" + + def emit(self, operation): + + warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + residual = '' + + values = { + 'operation_name': operation.procedural_name(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[operation.A.layout], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[operation.B.layout], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[operation.C.layout], + 'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'align_a': str(operation.A.alignment), + 'align_b': str(operation.B.alignment), + 'transform_a': ComplexTransformTag[operation.A.complex_transform], + 'transform_b': ComplexTransformTag[operation.B.complex_transform], + 'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation], + 'residual': residual + } + + template = self.gemm_complex_template if operation.is_complex() else self.gemm_template + + return SubstituteTemplate(template, values) + +# +class EmitGemvBatchedStridedInstance: + ''' Responsible for emitting a CUTLASS template definition''' + + def __init__(self): + self.template = """ + // Gemm operator ${operation_name} + using Operation_${operation_name} = cutlass::gemm::kernel::DefaultGemv< + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${thread_shape_m}, ${thread_shape_n}, ${thread_shape_k}>, + ${element_a}, ${layout_a}, + ${element_b}, ${layout_b}, + ${element_c}, ${layout_c} + >; +""" + + def emit(self, operation): + + values = { + 'operation_name': operation.procedural_name(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[operation.A.layout], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[operation.B.layout], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[operation.C.layout], + 'threadblock_shape_m': str(operation.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.threadblock_shape[1]), + 'threadblock_shape_k': 
str(operation.threadblock_shape[2]), + 'thread_shape_m': str(operation.thread_shape[0]), + 'thread_shape_n': str(operation.thread_shape[1]), + 'thread_shape_k': str(operation.thread_shape[2]), + } + + return SubstituteTemplate(self.template, values) + + +################################################################################################### + +class EmitSparseGemmInstance: + ''' Responsible for emitting a CUTLASS template definition''' + + def __init__(self): + self.gemm_template = """ + // Gemm operator ${operation_name} + using Operation_${operation_name} = cutlass::gemm::device::SparseGemm< + ${element_a}, ${layout_a}, + ${element_b}, ${layout_b}, + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${align_a}, + ${align_b}, + false, + ${math_operation} + ${residual} + >; +""" + + def emit(self, operation): + + warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + residual = '' + + values = { + 'operation_name': operation.procedural_name(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[operation.A.layout], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[operation.B.layout], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[operation.C.layout], + 'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'align_a': str(operation.A.alignment), + 'align_b': str(operation.B.alignment), + 'transform_a': ComplexTransformTag[operation.A.complex_transform], + 'transform_b': ComplexTransformTag[operation.B.complex_transform], + 'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation], + 'residual': residual + } + + template = self.gemm_template + + return SubstituteTemplate(template, values) + 
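# NOTE (editorial sketch, not part of the original patch): every emitter above
# follows the same pattern -- build a `values` dict, then expand a `${key}`
# template through SubstituteTemplate() from library.py.  That helper is
# assumed here to behave like string.Template substitution; the function name
# below is hypothetical and only illustrates the mechanism.
from string import Template

def _substitute_template_sketch(template: str, values: dict) -> str:
    # Expand ${key} placeholders with the supplied strings; keys missing from
    # `values` are left in place rather than raising.
    return Template(template).safe_substitute(**values)

# Example: the GemmShape fragment used by the GEMM emitters above.
# _substitute_template_sketch(
#     "cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>",
#     {"threadblock_shape_m": "128", "threadblock_shape_n": "128", "threadblock_shape_k": "32"})
# yields "cutlass::gemm::GemmShape<128, 128, 32>"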
+################################################################################################### + + +# +class EmitGemmUniversalInstance: + ''' Responsible for emitting a CUTLASS template definition''' + + def __init__(self): + self.gemm_template = """ +// Gemm operator ${operation_name} +using ${operation_name}_base = + typename cutlass::gemm::kernel::DefaultGemmUniversal< + ${element_b}, ${layout_b}, ${transform_b}, ${align_b}, // transposed B operand + ${element_a}, ${layout_a}, ${transform_a}, ${align_a}, // transposed A operand + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${math_operation} +>::GemmKernel; + +// Define named type +struct ${operation_name} : + public ${operation_name}_base { }; +""" + self.gemm_template_interleaved = """ +// Gemm operator ${operation_name} +using ${operation_name}_base = + typename cutlass::gemm::kernel::DefaultGemmUniversal< + ${element_a}, ${layout_a}, ${transform_a}, ${align_a}, + ${element_b}, ${layout_b}, ${transform_b}, ${align_b}, + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${math_operation} +>::GemmKernel; + +// Define named type +struct ${operation_name} : + public ${operation_name}_base { }; +""" + + def emit(self, operation): + + threadblock_shape = operation.tile_description.threadblock_shape + warp_count = operation.tile_description.warp_count + + warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + transpose_layouts = { + LayoutType.ColumnMajor: LayoutType.RowMajor, + LayoutType.RowMajor: LayoutType.ColumnMajor + } + + if operation.A.layout in transpose_layouts.keys() and \ + operation.B.layout in transpose_layouts.keys() and \ + operation.C.layout in transpose_layouts.keys(): + + instance_layout_A = transpose_layouts[operation.A.layout] + instance_layout_B = transpose_layouts[operation.B.layout] + instance_layout_C = transpose_layouts[operation.C.layout] + + gemm_template = self.gemm_template + else: + instance_layout_A, instance_layout_B, instance_layout_C = \ + (operation.A.layout, operation.B.layout, operation.C.layout) + + gemm_template = self.gemm_template_interleaved + # + + values = { + 'operation_name': operation.procedural_name(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[instance_layout_A], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[instance_layout_B], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[instance_layout_C], + 
'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'align_a': str(operation.A.alignment), + 'align_b': str(operation.B.alignment), + 'transform_a': ComplexTransformTag[operation.A.complex_transform], + 'transform_b': ComplexTransformTag[operation.B.complex_transform], + 'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation] + } + + return SubstituteTemplate(gemm_template, values) + +################################################################################################### + +# +class EmitGemmPlanarComplexInstance: + ''' Responsible for emitting a CUTLASS template definition''' + + def __init__(self): + self.template = """ + // Gemm operator ${operation_name} + using Operation_${operation_name} = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + ${element_a}, ${layout_a}, ${transform_a}, ${alignment_a}, + ${element_b}, ${layout_b}, ${transform_b}, ${alignment_b}, + ${element_c}, cutlass::layout::RowMajor, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + ${element_c}, + ${alignment_c}, + ${element_accumulator}, + ${element_epilogue} + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + ${stages}, + ${math_operator} + >::GemmKernel; + + struct ${operation_name} : + public Operation_${operation_name} { }; +""" + + def emit(self, operation): + + warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)] + + # exchange and transpose A and B types, layouts, and complex transforms since the C layout is row-major + transposed_layout_A = TransposedLayout[operation.A.layout] + transposed_layout_B = TransposedLayout[operation.B.layout] + + values = { + 'operation_name': operation.procedural_name(), + 'element_a': DataTypeTag[operation.B.element], + 'layout_a': LayoutTag[transposed_layout_B], + 'transform_a': ComplexTransformTag[operation.B.complex_transform], + 'alignment_a': str(operation.B.alignment), + 'element_b': DataTypeTag[operation.A.element], + 'layout_b': LayoutTag[transposed_layout_A], + 
'transform_b': ComplexTransformTag[operation.A.complex_transform], + 'alignment_b': str(operation.A.alignment), + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[operation.C.layout], + 'element_accumulator': DataTypeTag[operation.tile_description.math_instruction.element_accumulator], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'alignment_c': str(operation.C.alignment), + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'stages': str(operation.tile_description.stages), + 'math_operator': 'cutlass::arch::OpMultiplyAdd' + } + + return SubstituteTemplate(self.template, values) + +################################################################################################### + +# +class EmitGemmPlanarComplexArrayInstance: + ''' Responsible for emitting a CUTLASS template definition''' + + def __init__(self): + self.template = """ + // Gemm operator ${operation_name} + using Operation_${operation_name} = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + ${element_a}, ${layout_a}, ${transform_a}, ${alignment_a}, + ${element_b}, ${layout_b}, ${transform_b}, ${alignment_b}, + ${element_c}, cutlass::layout::RowMajor, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + ${element_c}, + ${alignment_c}, + ${element_accumulator}, + ${element_epilogue} + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + ${stages}, + ${math_operator} + >::GemmArrayKernel; + + struct ${operation_name} : public Operation_${operation_name} { }; +""" + + def emit(self, operation): + + warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)] + + # exchange and transpose A and B types, layouts, and complex transforms since the C layout is row-major + transposed_layout_A = TransposedLayout[operation.A.layout] + transposed_layout_B = TransposedLayout[operation.B.layout] + + values = { + 'operation_name': operation.procedural_name(), + 'element_a': DataTypeTag[operation.B.element], + 'layout_a': LayoutTag[transposed_layout_B], + 'transform_a': ComplexTransformTag[operation.B.complex_transform], + 'alignment_a': str(operation.B.alignment), + 'element_b': DataTypeTag[operation.A.element], + 'layout_b': LayoutTag[transposed_layout_A], + 'transform_b': ComplexTransformTag[operation.A.complex_transform], + 'alignment_b': str(operation.A.alignment), + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': 
LayoutTag[operation.C.layout], + 'element_accumulator': DataTypeTag[operation.tile_description.math_instruction.element_accumulator], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'alignment_c': str(operation.C.alignment), + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'stages': str(operation.tile_description.stages), + 'math_operator': 'cutlass::arch::OpMultiplyAdd' + } + + return SubstituteTemplate(self.template, values) + +# +class EmitGemmSplitKParallelInstance: + ''' Responsible for emitting a CUTLASS template definition''' + + def __init__(self): + self.template = """ + // Gemm operator ${operation_name} + using Operation_${operation_name} = cutlass::gemm::device::GemmSplitKParallel< + ${element_a}, ${layout_a}, + ${element_b}, ${layout_b}, + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + > + >; +""" + def emit(self, operation): + + warp_shape = [operation.tile_description.threadblock_shape[idx] // operation.tile_description.warp_count[idx] for idx in range(3)] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + values = { + 'operation_name': operation.procedural_name(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[operation.A.layout], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[operation.B.layout], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[operation.C.layout], + 'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': 
str(epilogue_vector_length), + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + } + + return SubstituteTemplate(self.template, values) + + +################################################################################################### + + +################################################################################################### +# +# Emitters functions for all targets +# +################################################################################################### + +class EmitGemmConfigurationLibrary: + def __init__(self, operation_path, configuration_name): + self.configuration_name = configuration_name + self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name).replace('\\', '/') + + self.instance_emitter = { + GemmKind.Gemm: EmitGemmInstance, + GemmKind.Sparse: EmitSparseGemmInstance, + GemmKind.Universal: EmitGemmUniversalInstance, + GemmKind.PlanarComplex: EmitGemmPlanarComplexInstance, + GemmKind.PlanarComplexArray: EmitGemmPlanarComplexArrayInstance + } + + self.gemm_kind_wrappers = { + GemmKind.Gemm: 'GemmOperation', + GemmKind.Sparse: 'GemmSparseOperation', + GemmKind.Universal: 'GemmUniversalOperation', + GemmKind.PlanarComplex: 'GemmPlanarComplexOperation', + GemmKind.PlanarComplexArray: 'GemmPlanarComplexArrayOperation' + } + + self.wmma_guard_start = "#if defined(CUTLASS_ARCH_WMMA_SM${sm_number}_ENABLED)" + + self.instance_template = { + GemmKind.Gemm: """ +${compile_guard_start} + manifest.append(new ${gemm_kind}("${operation_name}")); +${compile_guard_end} +""", + GemmKind.Sparse: """ +${compile_guard_start} + manifest.append(new ${gemm_kind}("${operation_name}")); +${compile_guard_end} +""", + GemmKind.Universal: """ +${compile_guard_start} + manifest.append(new ${gemm_kind}< + cutlass::gemm::device::GemmUniversalAdapter<${operation_name}> + >("${operation_name}")); +${compile_guard_end} +""", + GemmKind.PlanarComplex: """ +${compile_guard_start} + manifest.append(new ${gemm_kind}< + cutlass::gemm::device::GemmUniversalAdapter<${operation_name}> + >("${operation_name}")); +${compile_guard_end} +""", + GemmKind.PlanarComplexArray: """ +${compile_guard_start} + manifest.append(new ${gemm_kind}< + cutlass::gemm::device::GemmUniversalAdapter<${operation_name}> + >("${operation_name}")); +${compile_guard_end} +""" + } + + self.header_template = """ +/* + Generated by gemm_operation.py - Do not edit. 
+*/ + +/////////////////////////////////////////////////////////////////////////////////////////////////// +#include "cutlass/arch/wmma.h" +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +#include "library_internal.h" +#include "gemm_operation.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + + self.initialize_function_template = """ + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +void initialize_${configuration_name}(Manifest &manifest) { + +""" + self.epilogue_template = """ + +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + + def __enter__(self): + self.configuration_file = open(self.configuration_path, "w") + self.configuration_file.write(self.header_template) + + self.instance_definitions = [] + self.instance_wrappers = [] + + self.operations = [] + return self + + def emit(self, operation): + emitter = self.instance_emitter[operation.gemm_kind]() + + self.operations.append(operation) + + self.instance_definitions.append(emitter.emit(operation)) + + self.instance_wrappers.append(SubstituteTemplate(self.instance_template[operation.gemm_kind], { + 'configuration_name': self.configuration_name, + 'operation_name': operation.procedural_name(), + 'gemm_kind': self.gemm_kind_wrappers[operation.gemm_kind], + 'compile_guard_start': SubstituteTemplate(self.wmma_guard_start, {'sm_number': str(operation.arch)}) \ + if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "", + 'compile_guard_end': "#endif" \ + if operation.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp else "" + })) + + def __exit__(self, exception_type, exception_value, traceback): + + # Write instance definitions in top-level namespace + for instance_definition in self.instance_definitions: + self.configuration_file.write(instance_definition) + + # Add wrapper objects within initialize() function + self.configuration_file.write(SubstituteTemplate(self.initialize_function_template, { + 'configuration_name': self.configuration_name + })) + + for instance_wrapper in self.instance_wrappers: + self.configuration_file.write(instance_wrapper) + + self.configuration_file.write(self.epilogue_template) + self.configuration_file.close() + +################################################################################################### +################################################################################################### + +class EmitGemmSingleKernelWrapper: + def __init__(self, kernel_path, gemm_operation, wrapper_path): + self.kernel_path = kernel_path + self.wrapper_path = wrapper_path + self.operation = gemm_operation + + gemm_wrapper = """ +template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( + const typename Operation_${operation_name}::ElementA* d_A, size_t lda, + const typename Operation_${operation_name}::ElementB* d_B, size_t ldb, + typename Operation_${operation_name}::ElementC* d_C, size_t ldc, + int* workspace, + cutlass::gemm::GemmCoord const& problem_size, + typename 
Operation_${operation_name}::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, int split_k_slices); +""" + + gemv_wrapper = """ +template void megdnn::cuda::cutlass_wrapper:: + cutlass_vector_matrix_mul_batched_strided_wrapper( + BatchedGemmCoord const& problem_size, + const typename Operation_${operation_name}::ElementA* d_A, size_t lda, size_t batch_stride_a, + const typename Operation_${operation_name}::ElementB* d_B, size_t ldb, size_t batch_stride_b, + typename Operation_${operation_name}::ElementCD* d_C, size_t ldc, size_t batch_stride_c, + cudaStream_t stream); +""" + + if self.operation.gemm_kind == GemmKind.SplitKParallel or \ + self.operation.gemm_kind == GemmKind.Gemm: + self.wrapper_template = gemm_wrapper + else: + assert self.operation.gemm_kind == GemmKind.GemvBatchedStrided + self.wrapper_template = gemv_wrapper + + instance_emitters = { + GemmKind.Gemm: EmitGemmInstance(), + GemmKind.SplitKParallel: EmitGemmSplitKParallelInstance(), + GemmKind.GemvBatchedStrided: EmitGemvBatchedStridedInstance(), + } + self.instance_emitter = instance_emitters[self.operation.gemm_kind] + + self.header_template = """ +#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include "${wrapper_path}" +""" + self.instance_template = """ +${operation_instance} +""" + + self.epilogue_template = """ +#pragma GCC diagnostic pop +#endif +""" + # + def __enter__(self): + self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name()) + self.kernel_file = LazyFile(self.kernel_path) + self.kernel_file.write(SubstituteTemplate(self.header_template, { + 'wrapper_path': self.wrapper_path, + })) + return self + + # + def emit(self): + self.kernel_file.write(SubstituteTemplate(self.instance_template, { + 'operation_instance': self.instance_emitter.emit(self.operation), + })) + + # emit wrapper + wrapper = SubstituteTemplate(self.wrapper_template, { + 'operation_name': self.operation.procedural_name(), + }) + self.kernel_file.write(wrapper) + + # + def __exit__(self, exception_type, exception_value, traceback): + self.kernel_file.write(self.epilogue_template) + self.kernel_file.close() + + +################################################################################################### +################################################################################################### + diff --git a/dnn/scripts/cutlass_generator/gen_list.py b/dnn/scripts/cutlass_generator/gen_list.py new file mode 100644 index 00000000..08ca0e88 --- /dev/null +++ b/dnn/scripts/cutlass_generator/gen_list.py @@ -0,0 +1,38 @@ +from generator import ( + GenerateGemmOperations, + GenerateGemvOperations, + GenerateConv2dOperations, + GenerateDeconvOperations, +) + + +class GenArg: + def __init__(self, gen_op, gen_type): + self.operations = gen_op + self.type = gen_type + + +def write_op_list(f, gen_op, gen_type): + if gen_op == "gemm": + operations = GenerateGemmOperations(GenArg(gen_op, gen_type)) + elif gen_op == "gemv": + operations = GenerateGemvOperations(GenArg(gen_op, gen_type)) + elif gen_op == "conv2d": + operations = GenerateConv2dOperations(GenArg(gen_op, gen_type)) + elif gen_op == "deconv": + operations = GenerateDeconvOperations(GenArg(gen_op, gen_type)) + for op in 
operations: + f.write(' "%s.cu",\n' % op.procedural_name()) + + +if __name__ == "__main__": + with open("list.bzl", "w") as f: + f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n") + f.write("cutlass_gen_list = [\n") + write_op_list(f, "gemm", "simt") + write_op_list(f, "gemv", "simt") + write_op_list(f, "deconv", "simt") + write_op_list(f, "conv2d", "simt") + write_op_list(f, "conv2d", "tensorop8816") + write_op_list(f, "conv2d", "tensorop8832") + f.write("]") diff --git a/dnn/scripts/cutlass_generator/generator.py b/dnn/scripts/cutlass_generator/generator.py new file mode 100644 index 00000000..61117b45 --- /dev/null +++ b/dnn/scripts/cutlass_generator/generator.py @@ -0,0 +1,651 @@ +# +# \file generator.py +# +# \brief Generates the CUTLASS Library's instances +# + +import enum +import os.path +import shutil +import argparse + +from library import * +from manifest import * +################################################################################################### + +# +def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0): + + # by default, use the latest CUDA Toolkit version + cuda_version = [11, 0, 132] + + # Update cuda_version based on parsed string + if semantic_ver_string != '': + for i, x in enumerate([int(x) for x in semantic_ver_string.split('.')]): + if i < len(cuda_version): + cuda_version[i] = x + else: + cuda_version.append(x) + return cuda_version >= [major, minor, patch] + + +################################################################################################### +################################################################################################### + +# +def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \ + alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \ + swizzling_functor = SwizzlingFunctor.Identity8): + + if complex_transforms is None: + complex_transforms = [(ComplexTransform.none, ComplexTransform.none),] + + element_a, element_b, element_c, element_epilogue = data_type + + operations = [] + + # by default, only generate the largest tile and largest alignment + if manifest.args.kernels == '': + tile_descriptions = [tile_descriptions[0],] + alignment_constraints = [alignment_constraints[0],] + + for layout in layouts: + for tile_description in tile_descriptions: + for alignment in alignment_constraints: + for complex_transform in complex_transforms: + + alignment_c = min(8, alignment) + + A = TensorDescription(element_a, layout[0], alignment, complex_transform[0]) + B = TensorDescription(element_b, layout[1], alignment, complex_transform[1]) + C = TensorDescription(element_c, layout[2], alignment_c) + + new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \ + tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor) + + manifest.append(new_operation) + operations.append(new_operation) + + return operations + +########################################################################################################### +# ConvolutionOperator support variations +# ____________________________________________________________________ +# ConvolutionalOperator | Analytic | Optimized +# ____________________________________________________________________ +# | Fprop | (strided) | (strided) +# | Dgrad | (strided, unity*) | (unity) +# | Wgrad | (strided) | (strided) +# ____________________________________________________________________ +# +# Note : Operator 
marked (*) are supported but not generated to keep the instantiated kernel count low +########################################################################################################### +# Convolution for 2D operations +def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment, \ + conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination): + + element_a, element_b, element_c, element_epilogue = data_type + + # one exceptional case + alignment_c = min(8, alignment) + + # iterator algorithm (analytic and optimized) + iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized] + + # by default, only generate the largest tile size + if manifest.args.kernels == '': + tile_descriptions = [tile_descriptions[0],] + + operations = [] + + for tile in tile_descriptions: + for conv_kind in conv_kinds: + for iterator_algorithm in iterator_algorithms: + A = TensorDescription(element_a, layout[0], alignment) + B = TensorDescription(element_b, layout[1], alignment) + C = TensorDescription(element_c, layout[2], alignment_c) + + # unity stride only for Optimized Dgrad + if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad): + new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ + A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor) + + manifest.append(new_operation) + operations.append(new_operation) + + # strided dgrad is not supported by Optimized Dgrad + if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad): + continue + + # strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic) + new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile,\ + A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor) + + manifest.append(new_operation) + operations.append(new_operation) + + return operations + +################################################################################################### +################################################################################################### + +def GenerateConv2d_Simt(args): + operations = [] + + layouts = [ + (LayoutType.TensorNC4HW4, LayoutType.TensorC4RSK4), + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 4], \ + DataType.s8, DataType.s8, DataType.s32, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add), + ] + + dst_layouts = [ + LayoutType.TensorNC4HW4, + LayoutType.TensorNC32HW32, + LayoutType.TensorNHWC, + LayoutType.TensorNHWC, + LayoutType.TensorNCHW + ] + + dst_types = [ + DataType.s8, + DataType.s8, + DataType.u4, + DataType.s4, + DataType.f32, + ] + + max_cc = 1024 + + for math_inst in math_instructions: + for layout in layouts: + for dst_type, dst_layout in zip(dst_types, dst_layouts): + if dst_type == DataType.s4 or dst_type == DataType.u4: + min_cc = 75 + skip_unity_kernel = True + else: + min_cc = 61 + skip_unity_kernel = False + tile_descriptions = [ + TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc), + 
TileDescription([ 32, 64, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), + ] + operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], + dst_layout, dst_type, min_cc, 32, 32, 32, + skip_unity_kernel) + return operations + + +def GenerateConv2d_TensorOp_8816(args): + operations = [] + + layouts = [ + (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32), + ] + + math_instructions = [ + MathInstruction( \ + [8, 8, 16], \ + DataType.s8, DataType.s8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + ] + + dst_layouts = [ + LayoutType.TensorNC32HW32, + LayoutType.TensorNC4HW4, + ] + + dst_types = [ + DataType.s8, + DataType.s8, + ] + + min_cc = 75 + max_cc = 1024 + + for math_inst in math_instructions: + for layout in layouts: + for dst_type, dst_layout in zip(dst_types, dst_layouts): + if dst_layout == LayoutType.TensorNC32HW32: + tile_descriptions = [ + TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 64, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc), + ] + else: + assert dst_layout == LayoutType.TensorNC4HW4 + tile_descriptions = [ + TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc), + ] + operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], + dst_layout, dst_type, min_cc, 128, 128, 64, + False) + return operations + +def GenerateConv2d_TensorOp_8832(args): + operations = [] + + layouts = [ + (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64), + ] + + math_instructions = [ + MathInstruction( \ + [8, 8, 32], \ + DataType.s4, DataType.s4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), \ + MathInstruction( \ + [8, 8, 32], \ + DataType.s4, DataType.u4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate) + ] + + dst_layouts = [ + LayoutType.TensorNC64HW64, + ] + + min_cc = 75 + max_cc = 1024 + + for math_inst in math_instructions: + for layout in layouts: + for dst_layout in dst_layouts: + dst_type = math_inst.element_b + tile_descriptions = [ + TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc), + ] + operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], + dst_layout, dst_type, min_cc, 128, 128, 64, + True) + + layouts_nhwc 
= [ + (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32), + (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 64), + (LayoutType.TensorNHWC, LayoutType.TensorNC32HW32, 128), + ] + + dst_layouts_nhwc = [ + LayoutType.TensorNHWC, + ] + + for math_inst in math_instructions: + for layout in layouts_nhwc: + for dst_layout in dst_layouts_nhwc: + dst_type = math_inst.element_b + tile_descriptions = [ + TileDescription([128, 32, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc), + ] + operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1], + dst_layout, dst_type, min_cc, layout[2], layout[2], 32, + False, ImplicitGemmMode.GemmTn) + return operations + +def GenerateDeconv_Simt(args): + operations = [] + + layouts = [ + (LayoutType.TensorNC4HW4, LayoutType.TensorK4RSC4), + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 4], \ + DataType.s8, DataType.s8, DataType.s32, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add), + ] + + dst_layouts = [ + LayoutType.TensorNC4HW4, + ] + + dst_types = [ + DataType.s8, + ] + + min_cc = 61 + max_cc = 1024 + + for math_inst in math_instructions: + for layout in layouts: + for dst_type, dst_layout in zip(dst_types, dst_layouts): + tile_descriptions = [ + TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), + ] + operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1], + dst_layout, dst_type, min_cc, 32, 32, 32, + True) + return operations + +################################################################################ +# parameters +# Edge - for tiles, the edges represent the length of one side +# Ratio - the maximum ratio between 2 edges, limits the skinnyness of tiles +# MaxEdge - maximum length of each edge +# Min/Max - minimum/maximum of the product of edge lengths +################################################################################ + +warpsPerThreadblockEdge = [1, 2, 4, 8, 16] +warpsPerThreadblockRatio = 2 +warpsPerThreadblockMax = 16 +# NOTE 1x32 and 2x16 warp tile shapes fail validation for ~10% of cases + +warpShapeEdges = [8, 16, 32, 64, 128, 256] +warpShapeRatio = 4 +warpShapeMax = 64*64 +warpShapeMin = 8*8 + +threadblockEdgeMax = 256 + +# char, type bits/elem, max tile, L0 threadblock tiles +precisions = { + "c" : [ "cutlass::complex", 64, 64*128, [ [ 64, 128], [ 64, 32] ] ], + "d" : [ "double", 64, 64*64, [ [ 64, 64], [ 32, 32] ] ], + "h" : [ "cutlass::half_t", 16, 128*256, [ [256, 128], [ 64, 128], [ 64, 32] ] ], + "i" : [ "int", 32, 128*128, [ [128, 64], [ 16, 32] ] ], + "s" : [ "float", 32, 128*128, [ [128, 256], [128, 128], [ 64, 64] ] ], + "z" : [ "cutlass::complex", 128, 64*64, [ [ 32, 64], [ 16, 32] ] ], +} +# L1 will have a single kernel for every unique shape +# L2 will have everything else +def GenerateGemm_Simt(args): + ################################################################################ + # warps per threadblock + ################################################################################ + warpsPerThreadblocks = [] + for warpsPerThreadblock0 in warpsPerThreadblockEdge: + for warpsPerThreadblock1 in warpsPerThreadblockEdge: + if warpsPerThreadblock0 / 
warpsPerThreadblock1 <= warpsPerThreadblockRatio \ + and warpsPerThreadblock1 / warpsPerThreadblock0 <= warpsPerThreadblockRatio \ + and warpsPerThreadblock0 * warpsPerThreadblock1 <= warpsPerThreadblockMax: + warpsPerThreadblocks.append([warpsPerThreadblock0, + warpsPerThreadblock1]) + + ################################################################################ + # warp shapes + ################################################################################ + warpNumThreads = 32 + warpShapes = [] + for warp0 in warpShapeEdges: + for warp1 in warpShapeEdges: + if warp0 / warp1 <= warpShapeRatio \ + and warp1 / warp0 <= warpShapeRatio \ + and warp0 * warp1 <= warpShapeMax \ + and warp0*warp1 > warpShapeMin: + warpShapes.append([warp0, warp1]) + + # sgemm + precisionType, precisionBits, threadblockMaxElements, threadblockTilesL0 = precisions["s"] + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor), # nn + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor), # nt + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor), # tn + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor), # tt + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 1], \ + DataType.f32, DataType.f32, DataType.f32, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add), + ] + + min_cc = 50 + max_cc = 1024 + + operations = [] + for math_inst in math_instructions: + for layout in layouts: + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + tile_descriptions = [ + TileDescription([64, 256, 8], 2, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 256, 8], 2, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 32, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 8, 32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 16, 32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 16, 64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 16, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc), + ] + for warpsPerThreadblock in warpsPerThreadblocks: + for warpShape in warpShapes: + warpThreadsM = 0 + if warpShape[0] > warpShape[1]: + warpThreadsM = 8 + else: + warpThreadsM = 4 + warpThreadsN = warpNumThreads / warpThreadsM + + # skip shapes with conflicting rectangularity + # they are unlikely to be fastest + blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1] + blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1] + warpG = warpShape[0] > warpShape[1] + warpL = warpShape[0] < warpShape[1] + + blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1]*2 + blockL2 = warpsPerThreadblock[0]*2 < warpsPerThreadblock[1] + warpG2 = warpShape[0] > 
warpShape[1]*2 + warpL2 = warpShape[0]*2 < warpShape[1] + + if blockG2 and warpL: continue + if blockL2 and warpG: continue + if warpG2 and blockL: continue + if warpL2 and blockG: continue + + # check threadblock ratios and max + threadblockTile = [warpShape[0]*warpsPerThreadblock[0], + warpShape[1]*warpsPerThreadblock[1]] + if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements: continue + if threadblockTile[0] > threadblockEdgeMax: continue + if threadblockTile[1] > threadblockEdgeMax: continue + totalThreads = warpNumThreads*warpsPerThreadblock[0]*warpsPerThreadblock[1] + + # calculate unroll + # ensure that every iteration at least a full load of A,B are done + unrollMin = 8 + unrollMin0 = totalThreads // threadblockTile[0] + unrollMin1 = totalThreads // threadblockTile[1] + unroll = max(unrollMin, unrollMin0, unrollMin1) + + threadTileM = warpShape[0] // warpThreadsM + threadTileN = warpShape[1] // warpThreadsN + if threadTileM < 2 or threadTileN < 2: continue + if threadTileM*threadTileN*precisionBits > 8*8*32: continue + + # epilogue currently only supports N < WarpNumThreads + if threadblockTile[1] < warpNumThreads: continue + + # limit smem + smemBitsA = threadblockTile[0]*unroll*2*precisionBits + smemBitsB = threadblockTile[1]*unroll*2*precisionBits + smemKBytes = (smemBitsA+smemBitsB)/8/1024 + if (smemKBytes > 48): continue + + tile = TileDescription([threadblockTile[0], threadblockTile[1], unroll], \ + 2, \ + [threadblockTile[0]//warpShape[0], threadblockTile[1]//warpShape[1], 1], \ + math_inst, min_cc, max_cc) + + def filter(t: TileDescription) -> bool: + nonlocal tile + return t.threadblock_shape[0] == tile.threadblock_shape[0] and \ + t.threadblock_shape[1] == tile.threadblock_shape[1] and \ + t.threadblock_shape[2] == tile.threadblock_shape[2] and \ + t.warp_count[0] == tile.warp_count[0] and \ + t.warp_count[1] == tile.warp_count[1] and \ + t.warp_count[2] == tile.warp_count[2] and \ + t.stages == tile.stages + if not any(t for t in tile_descriptions if filter(t)): continue + + operations += GeneratesGemm(tile, data_type, layout[0], layout[1], layout[2], min_cc) + return operations + +# +def GenerateGemv_Simt(args): + threadBlockShape_N = [128, 64, 32] + ldgBits_A = [128, 64, 32] + ldgBits_B = [128, 64, 32] + + layouts = [ + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor), + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 1], \ + DataType.f32, DataType.f32, DataType.f32, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add), + ] + + min_cc = 50 + + operations = [] + for math_inst in math_instructions: + for layout in layouts: + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + for threadblock_shape_n in threadBlockShape_N: + for align_a in ldgBits_A: + for align_b in ldgBits_B: + ldg_elements_a = align_a // DataTypeSize[math_inst.element_a] + ldg_elements_b = align_b // DataTypeSize[math_inst.element_b] + threadblock_shape_k = (256 * ldg_elements_a) // (threadblock_shape_n // ldg_elements_b) + threadblock_shape = [1, threadblock_shape_n, threadblock_shape_k] + thread_shape = [1, ldg_elements_b, ldg_elements_a] + + operations.append(GeneratesGemv(math_inst, \ + threadblock_shape, \ + thread_shape, \ + data_type, \ + layout[0], \ + layout[1], \ + layout[2], \ + min_cc, \ + align_a, \ + align_b)) + return operations + +# +def GenerateConv2dOperations(args): + if args.type == "simt": + return GenerateConv2d_Simt(args) + elif args.type == "tensorop8816": + 
return GenerateConv2d_TensorOp_8816(args) + else: + assert args.type == "tensorop8832", "operation conv2d only support" \ + "simt, tensorop8816 and tensorop8832. (got:{})".format(args.type) + return GenerateConv2d_TensorOp_8832(args) + +def GenerateDeconvOperations(args): + assert args.type == "simt", "operation deconv only support" \ + "simt. (got:{})".format(args.type) + return GenerateDeconv_Simt(args) + +def GenerateGemmOperations(args): + assert args.type == "simt", "operation gemm only support" \ + "simt. (got:{})".format(args.type) + return GenerateGemm_Simt(args) + +def GenerateGemvOperations(args): + assert args.type == "simt", "operation gemv only support" \ + "simt. (got:{})".format(args.type) + return GenerateGemv_Simt(args) + +################################################################################################### +################################################################################################### + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Generates device kernel registration code for CUTLASS Kernels") + parser.add_argument("--operations", type=str, choices=['gemm', 'gemv', 'conv2d', 'deconv'], + required=True, help="Specifies the operation to generate (gemm, gemv, conv2d, deconv)") + parser.add_argument("output", type=str, help="output directory for CUTLASS kernel files") + parser.add_argument("--type", type=str, choices=['simt', 'tensorop8816', 'tensorop8832'], + default='simt', help="kernel type of CUTLASS kernel generator") + + operation2wrapper_path = { + "gemm": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl", \ + "gemv": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl", \ + "conv2d": "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl", \ + "deconv": "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl", \ + } + + args = parser.parse_args() + + wrapper_path = operation2wrapper_path[args.operations] + if args.operations == "gemm": + operations = GenerateGemmOperations(args) + elif args.operations == "gemv": + operations = GenerateGemvOperations(args) + elif args.operations == "conv2d": + operations = GenerateConv2dOperations(args) + elif args.operations == "deconv": + operations = GenerateDeconvOperations(args) + + + if args.operations == "conv2d" or args.operations == "deconv": + for operation in operations: + with EmitConvSingleKernelWrapper(args.output, operation, wrapper_path) as emitter: + emitter.emit() + elif args.operations == "gemm" or args.operations == "gemv": + for operation in operations: + with EmitGemmSingleKernelWrapper(args.output, operation, wrapper_path) as emitter: + emitter.emit() + +# +################################################################################################### diff --git a/dnn/scripts/cutlass_generator/lazy_file.py b/dnn/scripts/cutlass_generator/lazy_file.py new file mode 100644 index 00000000..d05cb63e --- /dev/null +++ b/dnn/scripts/cutlass_generator/lazy_file.py @@ -0,0 +1,27 @@ +# +# \file lazy_file.py +# +# \brief LazyFile updates the target file only when the content is changed +# in order to avoid generating new cutlass kimpls each time cmake is called +# + +import io +import os + +class LazyFile: + def __init__(self, filename): + self.filename = filename + self.buffer = io.StringIO() + + def write(self, data): + self.buffer.write(str(data)) + + def close(self): + if os.path.isfile(self.filename): + old_data = open(self.filename).read() + else: + old_data = "" + new_data = 
self.buffer.getvalue() + if old_data != new_data: + with open(self.filename, "w") as f: + f.write(new_data) diff --git a/dnn/scripts/cutlass_generator/library.py b/dnn/scripts/cutlass_generator/library.py new file mode 100644 index 00000000..5e82a295 --- /dev/null +++ b/dnn/scripts/cutlass_generator/library.py @@ -0,0 +1,614 @@ +# +# \file generator.py +# +# \brief Generates the CUTLASS Library's instances +# + +import re + +################################################################################################### + +import enum + +# The following block implements enum.auto() for Python 3.5 variants that don't include it such +# as the default 3.5.2 on Ubuntu 16.04. +# +# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility + +try: + from enum import auto as enum_auto +except ImportError: + __cutlass_library_auto_enum = 0 + def enum_auto() -> int: + global __cutlass_library_auto_enum + i = __cutlass_library_auto_enum + __cutlass_library_auto_enum += 1 + return i + +################################################################################################### + +# +class GeneratorTarget(enum.Enum): + Library = enum_auto() +# +GeneratorTargetNames = { + GeneratorTarget.Library: 'library' +} +# + +################################################################################################### + +# +class DataType(enum.Enum): + b1 = enum_auto() + u4 = enum_auto() + u8 = enum_auto() + u16 = enum_auto() + u32 = enum_auto() + u64 = enum_auto() + s4 = enum_auto() + s8 = enum_auto() + s16 = enum_auto() + s32 = enum_auto() + s64 = enum_auto() + f16 = enum_auto() + bf16 = enum_auto() + f32 = enum_auto() + tf32 = enum_auto() + f64 = enum_auto() + cf16 = enum_auto() + cbf16 = enum_auto() + cf32 = enum_auto() + ctf32 = enum_auto() + cf64 = enum_auto() + cs4 = enum_auto() + cs8 = enum_auto() + cs16 = enum_auto() + cs32 = enum_auto() + cs64 = enum_auto() + cu4 = enum_auto() + cu8 = enum_auto() + cu16 = enum_auto() + cu32 = enum_auto() + cu64 = enum_auto() + invalid = enum_auto() + +# +ShortDataTypeNames = { + DataType.s32: 'i', + DataType.f16: 'h', + DataType.f32: 's', + DataType.f64: 'd', + DataType.cf32: 'c', + DataType.cf64: 'z', +} + +# +DataTypeNames = { + DataType.b1: "b1", + DataType.u4: "u4", + DataType.u8: "u8", + DataType.u16: "u16", + DataType.u32: "u32", + DataType.u64: "u64", + DataType.s4: "s4", + DataType.s8: "s8", + DataType.s16: "s16", + DataType.s32: "s32", + DataType.s64: "s64", + DataType.f16: "f16", + DataType.bf16: "bf16", + DataType.f32: "f32", + DataType.tf32: "tf32", + DataType.f64: "f64", + DataType.cf16: "cf16", + DataType.cbf16: "cbf16", + DataType.cf32: "cf32", + DataType.ctf32: "ctf32", + DataType.cf64: "cf64", + DataType.cu4: "cu4", + DataType.cu8: "cu8", + DataType.cu16: "cu16", + DataType.cu32: "cu32", + DataType.cu64: "cu64", + DataType.cs4: "cs4", + DataType.cs8: "cs8", + DataType.cs16: "cs16", + DataType.cs32: "cs32", + DataType.cs64: "cs64", +} + +DataTypeTag = { + DataType.b1: "cutlass::uint1b_t", + DataType.u4: "cutlass::uint4b_t", + DataType.u8: "uint8_t", + DataType.u16: "uint16_t", + DataType.u32: "uint32_t", + DataType.u64: "uint64_t", + DataType.s4: "cutlass::int4b_t", + DataType.s8: "int8_t", + DataType.s16: "int16_t", + DataType.s32: "int32_t", + DataType.s64: "int64_t", + DataType.f16: "cutlass::half_t", + DataType.bf16: "cutlass::bfloat16_t", + DataType.f32: "float", + DataType.tf32: "cutlass::tfloat32_t", + DataType.f64: "double", + DataType.cf16: "cutlass::complex", + DataType.cbf16: 
"cutlass::complex", + DataType.cf32: "cutlass::complex", + DataType.ctf32: "cutlass::complex", + DataType.cf64: "cutlass::complex", + DataType.cu4: "cutlass::complex", + DataType.cu8: "cutlass::complex", + DataType.cu16: "cutlass::complex", + DataType.cu32: "cutlass::complex", + DataType.cu64: "cutlass::complex", + DataType.cs4: "cutlass::complex", + DataType.cs8: "cutlass::complex", + DataType.cs16: "cutlass::complex", + DataType.cs32: "cutlass::complex", + DataType.cs64: "cutlass::complex", +} + +DataTypeSize = { + DataType.b1: 1, + DataType.u4: 4, + DataType.u8: 4, + DataType.u16: 16, + DataType.u32: 32, + DataType.u64: 64, + DataType.s4: 4, + DataType.s8: 8, + DataType.s16: 16, + DataType.s32: 32, + DataType.s64: 64, + DataType.f16: 16, + DataType.bf16: 16, + DataType.f32: 32, + DataType.tf32: 32, + DataType.f64: 64, + DataType.cf16: 32, + DataType.cbf16: 32, + DataType.cf32: 64, + DataType.ctf32: 32, + DataType.cf64: 128, + DataType.cu4: 8, + DataType.cu8: 16, + DataType.cu16: 32, + DataType.cu32: 64, + DataType.cu64: 128, + DataType.cs4: 8, + DataType.cs8: 16, + DataType.cs16: 32, + DataType.cs32: 64, + DataType.cs64: 128, +} + +################################################################################################### + +# +class ComplexTransform(enum.Enum): + none = enum_auto() + conj = enum_auto() + +# +ComplexTransformTag = { + ComplexTransform.none: 'cutlass::ComplexTransform::kNone', + ComplexTransform.conj: 'cutlass::ComplexTransform::kConjugate', +} + +# +RealComplexBijection = [ + (DataType.f16, DataType.cf16), + (DataType.f32, DataType.cf32), + (DataType.f64, DataType.cf64), +] + +# +def is_complex(data_type): + for r, c in RealComplexBijection: + if data_type == c: + return True + return False + +# +def get_complex_from_real(real_type): + for r, c in RealComplexBijection: + if real_type == r: + return c + return DataType.invalid + +# +def get_real_from_complex(complex_type): + for r, c in RealComplexBijection: + if complex_type == c: + return r + return DataType.invalid + +# +class ComplexMultiplyOp(enum.Enum): + multiply_add = enum_auto() + gaussian = enum_auto() + +################################################################################################### + +# +class MathOperation(enum.Enum): + multiply_add = enum_auto() + multiply_add_saturate = enum_auto() + xor_popc = enum_auto() + multiply_add_fast_bf16 = enum_auto() + multiply_add_fast_f16 = enum_auto() + multiply_add_complex = enum_auto() + multiply_add_complex_gaussian = enum_auto() + +# +MathOperationTag = { + MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd', + MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate', + MathOperation.xor_popc: 'cutlass::arch::OpXorPopc', + MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16', + MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16', + MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex', + MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex', +} + +################################################################################################### + +# +class LayoutType(enum.Enum): + ColumnMajor = enum_auto() + RowMajor = enum_auto() + ColumnMajorInterleaved2 = enum_auto() + RowMajorInterleaved2 = enum_auto() + ColumnMajorInterleaved32 = enum_auto() + RowMajorInterleaved32 = enum_auto() + ColumnMajorInterleaved64 = enum_auto() + RowMajorInterleaved64 = enum_auto() + TensorNHWC = enum_auto() + TensorNDHWC = enum_auto() 
+ TensorNCHW = enum_auto() + TensorNGHWC = enum_auto() + TensorNC4HW4 = enum_auto() + TensorC4RSK4 = enum_auto() + TensorNC8HW8 = enum_auto() + TensorNC16HW16 = enum_auto() + TensorNC32HW32 = enum_auto() + TensorNC64HW64 = enum_auto() + TensorC32RSK32 = enum_auto() + TensorC64RSK64 = enum_auto() + TensorK4RSC4 = enum_auto() + +# +LayoutTag = { + LayoutType.ColumnMajor: 'cutlass::layout::ColumnMajor', + LayoutType.RowMajor: 'cutlass::layout::RowMajor', + LayoutType.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>', + LayoutType.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>', + LayoutType.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>', + LayoutType.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>', + LayoutType.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>', + LayoutType.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>', + LayoutType.TensorNHWC: 'cutlass::layout::TensorNHWC', + LayoutType.TensorNDHWC: 'cutlass::layout::TensorNDHWC', + LayoutType.TensorNCHW: 'cutlass::layout::TensorNCHW', + LayoutType.TensorNGHWC: 'cutlass::layout::TensorNGHWC', + LayoutType.TensorNC4HW4: 'cutlass::layout::TensorNCxHWx<4>', + LayoutType.TensorC4RSK4: 'cutlass::layout::TensorCxRSKx<4>', + LayoutType.TensorNC8HW8: 'cutlass::layout::TensorNCxHWx<8>', + LayoutType.TensorNC16HW16: 'cutlass::layout::TensorNCxHWx<16>', + LayoutType.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>', + LayoutType.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>', + LayoutType.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>', + LayoutType.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>', + LayoutType.TensorK4RSC4: 'cutlass::layout::TensorKxRSCx<4>', +} + +# +TransposedLayout = { + LayoutType.ColumnMajor: LayoutType.RowMajor, + LayoutType.RowMajor: LayoutType.ColumnMajor, + LayoutType.ColumnMajorInterleaved2: LayoutType.RowMajorInterleaved2, + LayoutType.RowMajorInterleaved2: LayoutType.ColumnMajorInterleaved2, + LayoutType.ColumnMajorInterleaved32: LayoutType.RowMajorInterleaved32, + LayoutType.RowMajorInterleaved32: LayoutType.ColumnMajorInterleaved32, + LayoutType.ColumnMajorInterleaved64: LayoutType.RowMajorInterleaved64, + LayoutType.RowMajorInterleaved64: LayoutType.ColumnMajorInterleaved64, + LayoutType.TensorNHWC: LayoutType.TensorNHWC +} + +# +ShortLayoutTypeNames = { + LayoutType.ColumnMajor: 'n', + LayoutType.ColumnMajorInterleaved32: 'n2', + LayoutType.ColumnMajorInterleaved32: 'n32', + LayoutType.ColumnMajorInterleaved64: 'n64', + LayoutType.RowMajor: 't', + LayoutType.RowMajorInterleaved2: 't2', + LayoutType.RowMajorInterleaved32: 't32', + LayoutType.RowMajorInterleaved64: 't64', + LayoutType.TensorNHWC: 'nhwc', + LayoutType.TensorNDHWC: 'ndhwc', + LayoutType.TensorNCHW: 'nchw', + LayoutType.TensorNGHWC: 'nghwc', + LayoutType.TensorNC4HW4: 'nc4hw4', + LayoutType.TensorC4RSK4: 'c4rsk4', + LayoutType.TensorNC8HW8: 'nc8hw8', + LayoutType.TensorNC16HW16: 'nc16hw16', + LayoutType.TensorNC32HW32: 'nc32hw32', + LayoutType.TensorNC64HW64: 'nc64hw64', + LayoutType.TensorC32RSK32: 'c32rsk32', + LayoutType.TensorC64RSK64: 'c64rsk64', + LayoutType.TensorK4RSC4: 'k4rsc4', +} + +# +ShortComplexLayoutNames = { + (LayoutType.ColumnMajor, ComplexTransform.none): 'n', + (LayoutType.ColumnMajor, ComplexTransform.conj): 'c', + (LayoutType.RowMajor, ComplexTransform.none): 't', + (LayoutType.RowMajor, ComplexTransform.conj): 'h' +} + 
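+# For orientation (illustrative only, not used by the generator itself):
+# an operation with src layout LayoutType.TensorNC4HW4 and filter layout
+# LayoutType.TensorC4RSK4 is emitted into C++ as
+# 'cutlass::layout::TensorNCxHWx<4>' / 'cutlass::layout::TensorCxRSKx<4>'
+# via LayoutTag, while its procedural name uses the short forms from
+# ShortLayoutTypeNames, e.g. the "..._nc4hw4_c4rsk4.cu" entries written
+# into list.bzl.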
+################################################################################################### +# +class OpcodeClass(enum.Enum): + Simt = enum_auto() + TensorOp = enum_auto() + WmmaTensorOp = enum_auto() + +OpcodeClassNames = { + OpcodeClass.Simt: 'simt', + OpcodeClass.TensorOp: 'tensorop', + OpcodeClass.WmmaTensorOp: 'wmma_tensorop', +} + +OpcodeClassTag = { + OpcodeClass.Simt: 'cutlass::arch::OpClassSimt', + OpcodeClass.TensorOp: 'cutlass::arch::OpClassTensorOp', + OpcodeClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp', +} + +################################################################################################### + +# +class OperationKind(enum.Enum): + Gemm = enum_auto() + Conv2d = enum_auto() + +# +OperationKindNames = { + OperationKind.Gemm: 'gemm' + , OperationKind.Conv2d: 'conv2d' +} + +# +class Target(enum.Enum): + library = enum_auto() + +ArchitectureNames = { + 50: 'maxwell', + 60: 'pascal', + 61: 'pascal', + 70: 'volta', + 75: 'turing', + 80: 'ampere', +} + +################################################################################################### + +# +def SubstituteTemplate(template, values): + text = template + changed = True + while changed: + changed = False + for key, value in values.items(): + regex = "\\$\\{%s\\}" % key + newtext = re.sub(regex, value, text) + if newtext != text: + changed = True + text = newtext + return text + +################################################################################################### + +# +class GemmKind(enum.Enum): + Gemm = enum_auto() + Sparse = enum_auto() + Universal = enum_auto() + PlanarComplex = enum_auto() + PlanarComplexArray = enum_auto() + SplitKParallel = enum_auto() + GemvBatchedStrided = enum_auto() + +# +GemmKindNames = { + GemmKind.Gemm: "gemm", + GemmKind.Sparse: "spgemm", + GemmKind.Universal: "gemm", + GemmKind.PlanarComplex: "gemm_planar_complex", + GemmKind.PlanarComplexArray: "gemm_planar_complex_array", + GemmKind.SplitKParallel: "gemm_split_k_parallel", + GemmKind.GemvBatchedStrided: "gemv_batched_strided", +} + +# +class EpilogueFunctor(enum.Enum): + LinearCombination = enum_auto() + LinearCombinationClamp = enum_auto() + BiasAddLinearCombination = enum_auto() + BiasAddLinearCombinationRelu = enum_auto() + BiasAddLinearCombinationHSwish = enum_auto() + BiasAddLinearCombinationClamp = enum_auto() + BiasAddLinearCombinationReluClamp = enum_auto() + BiasAddLinearCombinationHSwishClamp = enum_auto() + + +# +EpilogueFunctorTag = { + EpilogueFunctor.LinearCombination: 'cutlass::epilogue::thread::LinearCombination', + EpilogueFunctor.LinearCombinationClamp: 'cutlass::epilogue::thread::LinearCombinationClamp', + EpilogueFunctor.BiasAddLinearCombination: 'cutlass::epilogue::thread::BiasAddLinearCombination', + EpilogueFunctor.BiasAddLinearCombinationRelu: 'cutlass::epilogue::thread::BiasAddLinearCombinationRelu', + EpilogueFunctor.BiasAddLinearCombinationHSwish: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwish', + EpilogueFunctor.BiasAddLinearCombinationClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationClamp', + EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp', + EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp', +} + +# +ShortEpilogueNames = { + EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'hswish', + EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'relu', + EpilogueFunctor.BiasAddLinearCombinationClamp: 'identity', + 
EpilogueFunctor.BiasAddLinearCombinationHSwish: 'hswish', + EpilogueFunctor.BiasAddLinearCombinationRelu: 'relu', + EpilogueFunctor.BiasAddLinearCombination: 'identity', +} + + + + + + +# +class SwizzlingFunctor(enum.Enum): + Identity1 = enum_auto() + Identity2 = enum_auto() + Identity4 = enum_auto() + Identity8 = enum_auto() + ConvFpropNCxHWx = enum_auto() + ConvFpropNHWC = enum_auto() + ConvDgradNCxHWx = enum_auto() + +# +SwizzlingFunctorTag = { + SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>', + SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>', + SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>', + SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>', + SwizzlingFunctor.ConvFpropNCxHWx: 'cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle', + SwizzlingFunctor.ConvFpropNHWC: 'cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle', + SwizzlingFunctor.ConvDgradNCxHWx: 'cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle', +} + +################################################################################################### + +class ConvType(enum.Enum): + Convolution = enum_auto() + BatchConvolution = enum_auto() + Local = enum_auto() + LocalShare = enum_auto() + +ConvTypeTag = { + ConvType.Convolution: 'cutlass::conv::ConvType::kConvolution', + ConvType.BatchConvolution: 'cutlass::conv::ConvType::kBatchConvolution', + ConvType.Local: 'cutlass::conv::ConvType::kLocal', + ConvType.LocalShare : 'cutlass::conv::ConvType::kLocalShare', +} + +# +class ConvKind(enum.Enum): + Fprop = enum_auto() + Dgrad = enum_auto() + Wgrad = enum_auto() + +# +ConvKindTag = { + ConvKind.Fprop: 'cutlass::conv::Operator::kFprop', + ConvKind.Dgrad: 'cutlass::conv::Operator::kDgrad', + ConvKind.Wgrad: 'cutlass::conv::Operator::kWgrad' +} + +ConvKindNames = { + ConvKind.Fprop: 'fprop', + ConvKind.Dgrad: 'dgrad', + ConvKind.Wgrad: 'wgrad', +} + +# +class IteratorAlgorithm(enum.Enum): + Analytic = enum_auto() + Optimized = enum_auto() + +# +IteratorAlgorithmTag = { + IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic', + IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized', +} + +IteratorAlgorithmNames = { + IteratorAlgorithm.Analytic: 'analytic', + IteratorAlgorithm.Optimized: 'optimized', +} + +# +class StrideSupport(enum.Enum): + Strided = enum_auto() + Unity = enum_auto() + +# +StrideSupportTag = { + StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided', + StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity', +} + +StrideSupportNames = { + StrideSupport.Strided: '', + StrideSupport.Unity: 'unity_stride', +} + +class ImplicitGemmMode(enum.Enum): + GemmNt = enum_auto() + GemmTn = enum_auto() + +ImplicitGemmModeNames = { + ImplicitGemmMode.GemmNt: 'gemm_nt', + ImplicitGemmMode.GemmTn: 'gemm_tn', +} + +ImplicitGemmModeTag = { + ImplicitGemmMode.GemmNt: 'cutlass::conv::ImplicitGemmMode::GEMM_NT', + ImplicitGemmMode.GemmTn: 'cutlass::conv::ImplicitGemmMode::GEMM_TN', +} + +################################################################################################### + +# +class MathInstruction: + def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class, math_operation = MathOperation.multiply_add): + self.instruction_shape = instruction_shape + self.element_a = element_a + self.element_b = element_b + 
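+        # element_accumulator is the accumulation type (e.g. DataType.s32 for
+        # the s8 kernels generated above); opcode_class selects the SIMT vs
+        # TensorOp path, and math_operation defaults to plain multiply-add.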
self.element_accumulator = element_accumulator + self.opcode_class = opcode_class + self.math_operation = math_operation + + +# +class TileDescription: + + def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute): + self.threadblock_shape = threadblock_shape + self.stages = stages + self.warp_count = warp_count + self.math_instruction = math_instruction + self.minimum_compute_capability = min_compute + self.maximum_compute_capability = max_compute + + def procedural_name(self): + return "%dx%d_%dx%d" % (self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], self.stages) + +# +class TensorDescription: + def __init__(self, element, layout, alignment = 1, complex_transform = ComplexTransform.none): + self.element = element + self.layout = layout + self.alignment = alignment + self.complex_transform = complex_transform + +################################################################################################### diff --git a/dnn/scripts/cutlass_generator/list.bzl b/dnn/scripts/cutlass_generator/list.bzl new file mode 100644 index 00000000..d1e10ac5 --- /dev/null +++ b/dnn/scripts/cutlass_generator/list.bzl @@ -0,0 +1,578 @@ +# Generated by dnn/scripts/cutlass_generator/gen_list.py + +cutlass_gen_list = [ + "cutlass_simt_sgemm_8x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_16x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_16x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_32x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_32x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_64x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_16x128_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu", + "cutlass_simt_sgemm_32x128_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu", + "cutlass_simt_sgemm_64x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_128x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_64x128_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu", + "cutlass_simt_sgemm_128x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_32x256_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu", + "cutlass_simt_sgemm_64x256_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu", + "cutlass_simt_sgemm_128x128_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu", + "cutlass_simt_sgemm_256x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu", + "cutlass_simt_sgemm_256x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu", + "cutlass_simt_sgemm_8x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_16x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_16x64_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu", + 
"cutlass_simt_sgemm_32x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_32x64_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu", + "cutlass_simt_sgemm_64x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_16x128_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu", + "cutlass_simt_sgemm_32x128_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu", + "cutlass_simt_sgemm_64x64_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu", + "cutlass_simt_sgemm_128x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_64x128_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu", + "cutlass_simt_sgemm_128x64_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu", + "cutlass_simt_sgemm_32x256_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu", + "cutlass_simt_sgemm_64x256_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu", + "cutlass_simt_sgemm_128x128_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu", + "cutlass_simt_sgemm_256x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu", + "cutlass_simt_sgemm_256x64_8x2_nt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu", + "cutlass_simt_sgemm_8x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_16x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_16x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_32x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_32x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_64x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_16x128_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu", + "cutlass_simt_sgemm_32x128_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu", + "cutlass_simt_sgemm_64x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_128x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_64x128_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu", + "cutlass_simt_sgemm_128x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_32x256_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu", + "cutlass_simt_sgemm_64x256_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu", + "cutlass_simt_sgemm_128x128_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu", + "cutlass_simt_sgemm_256x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu", + "cutlass_simt_sgemm_256x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu", + "cutlass_simt_sgemm_8x32_8x2_tt_align1.cu", + 
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_16x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_16x64_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu", + "cutlass_simt_sgemm_32x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_32x64_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu", + "cutlass_simt_sgemm_64x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_16x128_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu", + "cutlass_simt_sgemm_32x128_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu", + "cutlass_simt_sgemm_64x64_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu", + "cutlass_simt_sgemm_128x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_64x128_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu", + "cutlass_simt_sgemm_128x64_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu", + "cutlass_simt_sgemm_32x256_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu", + "cutlass_simt_sgemm_64x256_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu", + "cutlass_simt_sgemm_128x128_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu", + "cutlass_simt_sgemm_256x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu", + "cutlass_simt_sgemm_256x64_8x2_tt_align1.cu", + "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu", + "cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu", + "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu", + "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu", + "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu", + "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu", + "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu", + "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu", + "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu", + "cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu", + "cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu", + "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu", + "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu", + "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu", + "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu", + "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu", + "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu", + "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu", + "cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu", + "cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu", + "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu", + "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu", + "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu", + "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu", + "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu", + "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu", + "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu", + 
"cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu", + "cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu", + "cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu", + "cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu", + "cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu", + "cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu", + "cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", + 
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", + "cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + 
"cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", + "cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + 
"cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + 
"cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", + "cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + 
"cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", + "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", + 
"cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", + 
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + 
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", + "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", + "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + 
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", + "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", + "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", + "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", +] \ No newline at end of file diff --git 
a/dnn/scripts/cutlass_generator/manifest.py b/dnn/scripts/cutlass_generator/manifest.py new file mode 100644 index 00000000..57333f39 --- /dev/null +++ b/dnn/scripts/cutlass_generator/manifest.py @@ -0,0 +1,352 @@ +# +# \file generator.py +# +# \brief Generates the CUTLASS Library's instances +# + +import enum +import os.path +import re +import shutil + +from library import * +from gemm_operation import * +from conv2d_operation import * + +################################################################################################### + +class EmitOperationKindLibrary: + def __init__(self, generated_path, kind, args): + self.generated_path = generated_path + self.kind = kind + self.args = args + + self.emitters = { + OperationKind.Gemm: EmitGemmConfigurationLibrary + , OperationKind.Conv2d: EmitConv2dConfigurationLibrary + } + + self.configurations = [] + + self.header_template = """ +/* + Generated by manifest.py - Do not edit. +*/ + +#include "cutlass/cutlass.h" +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +namespace cutlass { +namespace library { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +""" + self.entry_template = """ + +// +// Entry point to construct operations +// +void initialize_all_${operation_name}_operations(Manifest &manifest) { +""" + self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n" + self.configuration_template = " initialize_${configuration_name}(manifest);\n" + + self.epilogue_template = """ + +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace library +} // namespace cutlass + +""" + + # + def __enter__(self): + self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind]) + os.mkdir(self.operation_path) + + self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind]) + + self.top_level_file = open(self.top_level_path, "w") + self.top_level_file.write(self.header_template) + + self.source_files = [self.top_level_path,] + + return self + + # + def emit(self, configuration_name, operations): + + with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter: + for operation in operations: + configuration_emitter.emit(operation) + + self.source_files.append(configuration_emitter.configuration_path) + + self.configurations.append(configuration_name) + self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} )) + + # + def __exit__(self, exception_type, exception_value, traceback): + self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]})) + + for configuration_name in self.configurations: + self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name})) + + self.top_level_file.write(self.epilogue_template) + self.top_level_file.close() + +################################################################################################### +################################################################################################### + +class Options: + def __init__(self): + pass + +################################################################################################### + +# +class Manifest: + + # + def __init__(self, args): + self.operations = {} + self.args =
args + + architectures = args.architectures.split(';') if len(args.architectures) else ['50',] + self.compute_capabilities = [int(x) for x in architectures] + + self.selected_kernels = [] + + if args.operations == 'all': + self.operations_enabled = [] + else: + + operations_list = [ + OperationKind.Gemm + , OperationKind.Conv2d + ] + + self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')] + + if args.kernels == 'all': + self.kernel_names = [] + else: + self.kernel_names = [x for x in args.kernels.split(',') if x != ''] + + self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != ''] + + if args.kernel_filter_file is None: + self.kernel_filter_list = [] + else: + self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file) + + + self.operation_count = 0 + self.operations_by_name = {} + self.top_level_prologue = ''' + +#include "cutlass/library/library.h" +#include "cutlass/library/manifest.h" + +namespace cutlass { +namespace library { + +${prototypes} + +void initialize_all(Manifest &manifest) { + +''' + self.top_level_reserve = ' manifest.reserve(${operation_count});\n\n' + self.top_level_epilogue = ''' +} + +} // namespace library +} // namespace cutlass + +''' + + + def get_kernel_filters (self, kernelListFile): + if os.path.isfile(kernelListFile): + with open(kernelListFile, 'r') as fileReader: + lines = [line.rstrip() for line in fileReader if not line.startswith("#")] + + lines = [re.compile(line) for line in lines if line] + return lines + else: + return [] + + + + def filter_out_kernels(self, kernel_name, kernel_filter_list): + + for kernel_filter_re in kernel_filter_list: + if kernel_filter_re.search(kernel_name) is not None: + return True + + return False + + + # + def _filter_string_matches(self, filter_string, haystack): + ''' Returns true if all substrings appear in the haystack in order''' + substrings = filter_string.split('*') + for sub in substrings: + idx = haystack.find(sub) + if idx < 0: + return False + haystack = haystack[idx + len(sub):] + return True + + # + def filter(self, operation): + ''' Filtering operations based on various criteria''' + + # filter based on compute capability + enabled = False + for cc in self.compute_capabilities: + if cc >= operation.tile_description.minimum_compute_capability and \ + cc <= operation.tile_description.maximum_compute_capability: + + enabled = True + break + + if not enabled: + return False + + if len(self.operations_enabled) and not operation.operation_kind in self.operations_enabled: + return False + + # eliminate duplicates + if operation.procedural_name() in self.operations_by_name.keys(): + return False + + # Filter based on list of valid substrings + if len(self.kernel_names): + name = operation.procedural_name() + enabled = False + + # compare against the include list + for name_substr in self.kernel_names: + if self._filter_string_matches(name_substr, name): + enabled = True + break + + # compare against the exclude list + for name_substr in self.ignore_kernel_names: + if self._filter_string_matches(name_substr, name): + enabled = False + break + + if len(self.kernel_filter_list) > 0: + enabled = False + if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list): + enabled = True + + + # todo: filter based on compute data type + return enabled + # + + # + def append(self, operation): + ''' + Inserts the operation. 
+ + operation_kind -> configuration_name -> [] + ''' + + if self.filter(operation): + + self.selected_kernels.append(operation.procedural_name()) + + self.operations_by_name[operation.procedural_name()] = operation + + # add the configuration + configuration_name = operation.configuration_name() + + if operation.operation_kind not in self.operations.keys(): + self.operations[operation.operation_kind] = {} + + if configuration_name not in self.operations[operation.operation_kind].keys(): + self.operations[operation.operation_kind][configuration_name] = [] + + self.operations[operation.operation_kind][configuration_name].append(operation) + self.operation_count += 1 + # + + # + def emit(self, target = GeneratorTarget.Library): + + operation_emitters = { + GeneratorTarget.Library: EmitOperationKindLibrary + } + + generated_path = os.path.join(self.args.curr_build_dir, 'generated') + + # create generated/ + if os.path.exists(generated_path): + shutil.rmtree(generated_path) + + os.mkdir(generated_path) + + source_files = [] + + top_level_path = os.path.join(generated_path, 'initialize_all.cpp') + with open(top_level_path, 'w') as top_level_file: + + if target == GeneratorTarget.Library: + source_files.append(top_level_path) + + prototypes = [] + for operation_kind, configurations in self.operations.items(): + prototypes.append(SubstituteTemplate( + "void initialize_all_${operation_kind}_operations(Manifest &manifest);", + {'operation_kind': OperationKindNames[operation_kind]})) + + top_level_file.write(SubstituteTemplate(self.top_level_prologue, + {'prototypes': "\n".join(prototypes)})) + + top_level_file.write(SubstituteTemplate( + self.top_level_reserve, {'operation_count': str(self.operation_count)})) + + # for each operation kind, emit initializer for all configurations + for operation_kind, configurations in self.operations.items(): + + with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter: + for configuration_name, operations in configurations.items(): + operation_kind_emitter.emit(configuration_name, operations) + + source_files += operation_kind_emitter.source_files + + top_level_file.write(SubstituteTemplate( + " initialize_all_${operation_kind}_operations(manifest);\n", + {'operation_kind': OperationKindNames[operation_kind]})) + + top_level_file.write(self.top_level_epilogue) + + # write the manifest.cmake file containing paths from all targets + manifest_path = os.path.join(generated_path, "manifest.cmake") + with open(manifest_path, "w") as manifest_file: + + target_name = 'cutlass_library_objs' + + target_text = SubstituteTemplate("""cutlass_target_sources( + ${target_name} + BATCH_SOURCES ON + PRIVATE +""", { 'target_name': target_name}) + + manifest_file.write(target_text) + + for source_file in source_files: + manifest_file.write(" %s\n" % str(source_file.replace('\\', '/'))) + manifest_file.write(")") + # + +################################################################################################### diff --git a/dnn/src/CMakeLists.txt b/dnn/src/CMakeLists.txt index c1990c4b..fb1d59c9 100644 --- a/dnn/src/CMakeLists.txt +++ b/dnn/src/CMakeLists.txt @@ -113,6 +113,31 @@ if(MGE_WITH_CUDA) list(APPEND SOURCES ${SOURCES_}) file(GLOB_RECURSE CUSOURCES cuda/*.cu) + + set(CUTLASS_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py) + set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated) + function(gen_cutlass_kimpl op type) + set(CURRENT_CUTLASS_GEN_DIR 
${CUTLASS_GEN_DIR}/${op}_${type}) + file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR}) + execute_process( + COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations ${op} --type ${type} ${CURRENT_CUTLASS_GEN_DIR} + RESULT_VARIABLE gen_cutlass_result + OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log + ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log + ) + if (NOT gen_cutlass_result EQUAL 0) + message(FATAL_ERROR "Error generating library instances. See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log") + endif() + endfunction() + gen_cutlass_kimpl(gemm simt) + gen_cutlass_kimpl(gemv simt) + gen_cutlass_kimpl(deconv simt) + gen_cutlass_kimpl(conv2d simt) + gen_cutlass_kimpl(conv2d tensorop8816) + gen_cutlass_kimpl(conv2d tensorop8832) + file(GLOB_RECURSE CUTLASS_SOURCES ${CUTLASS_GEN_DIR}/*.cu) + list(APPEND SOURCES ${CUTLASS_SOURCES}) + list(APPEND SOURCES ${CUSOURCES}) endif() diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 059b3287..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index ecab6b8c..00000000 --- 
a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index aef4745a..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - 
cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index bd482580..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 1a20121c..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include 
"src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 3f19c584..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename 
Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index c66ac54e..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 45590035..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - 
cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 64a3e278..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index d0ead0c3..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 6994ef17..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 
64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 1d9e6c79..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 952bf9fe..00000000 --- 
a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 83b6a72f..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - cutlass::arch::OpMultiplyAddSaturate, - 
cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 4d1c3231..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index d5d174c6..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include 
"src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index b39a3b38..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename 
Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 1a6f8b69..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index ef91b066..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - 
cutlass::layout::TensorCxRSKx<64>, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 574fbba8..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 22ed7316..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 35db883a..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - 
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 33a25779..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index fc8339b4..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 
@@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index b4934fb2..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - 
const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index d65d8dca..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - cutlass::layout::TensorCxRSKx<64>, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index 33aa6e47..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance 
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - cutlass::layout::TensorCxRSKx<64>, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 99511641..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 5b023a5c..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 2a4de50e..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - 
cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 0586bbd0..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu 
b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index b4c9d9ec..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index c5139b49..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - 
>, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index 77d958b1..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - cutlass::layout::TensorCxRSKx<64>, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index 52f4334d..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of 
cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - cutlass::layout::TensorCxRSKx<64>, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index c2d41fdc..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename 
Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 331686a1..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index e0d8b1e8..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - 
cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index e0917aa6..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index ba7aacb2..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 1bb9c179..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - 
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index 7fc1a3e3..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - cutlass::layout::TensorCxRSKx<64>, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index d0e906e7..00000000 --- 
a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 80c9fc67..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - 
cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index ba1875ff..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 4502520f..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic 
ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 2ead7f5d..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const 
typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index d0f6e113..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 94a82f82..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - 
cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 0d17a91e..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff 
--git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 967b0341..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index f780d1a0..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, 
- cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 8db5b1af..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index e4a28e3e..00000000 --- 
a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index db1db695..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - cutlass::layout::TensorCxRSKx<64>, - cutlass::uint4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 
32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index f6b7d1e7..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 1353ffd1..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored 
"-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index dbdc1bd6..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - 
typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index a7f2d479..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 7c3a1ef3..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - 
cutlass::layout::TensorNCxHWx<32>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index d4e2e602..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index 65b56c5e..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - cutlass::layout::TensorCxRSKx<64>, - cutlass::uint4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index 80f50629..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - cutlass::layout::TensorCxRSKx<64>, - cutlass::uint4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, 
- cutlass::gemm::GemmShape<128, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 0429ab43..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 6f9fbba8..00000000 --- 
a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index e8f1001d..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - 
cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu deleted file mode 100644 index 35d1f344..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<16>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu deleted file mode 100644 index 9ac7388e..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include 
"src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<32>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu deleted file mode 100644 index 46e1bdea..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - cutlass::int4b_t, - cutlass::layout::TensorNCxHWx<8>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, - 2, - 8, - 8, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_TN>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - 
int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu b/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu deleted file mode 100644 index 55acfe28..00000000 --- a/dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - cutlass::uint4b_t, - cutlass::layout::TensorNCxHWx<64>, - cutlass::int4b_t, - cutlass::layout::TensorCxRSKx<64>, - cutlass::uint4b_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::layout::TensorNCxHWx<64>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 128>, - cutlass::gemm::GemmShape<64, 64, 128>, - cutlass::gemm::GemmShape<8, 8, 32>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 16, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 32, - 32, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 0188b8c9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, 
- float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 634732ef..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 10978a20..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index e8ac6042..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, 
- 8, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 811cdfcd..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index d33a30f0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - 
-// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 6692dfb0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - 
typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 73e10efc..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 550dabf0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - 
cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 0ea16d92..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 6949e97f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore 
warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 7bd8c0a1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename 
Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index d9560c0e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index f49957b0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - 
cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 08f98ebd..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 13c20c41..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 116d1981..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, 
- cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 772e9be8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index b2f08485..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - 
- -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 995a362b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - 
typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 5e1d157f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 7d758dd7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - 
cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index d0b56d97..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 1963beb0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning 
of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index cb6b9468..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - 
typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index f3db7a99..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index f3dc2913..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - 
cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 2bcac258..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index f40cfa4c..00000000 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index ad941b12..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 161bbea5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index e6f63ba8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename 
cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index b3722c74..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 2990d138..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 60d6b36b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - 
>, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 24f333e4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 4a7c4f82..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" 
-#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 695dbb8d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params 
const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index ff4780c9..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 3e169cec..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - 
cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index d5ec49e3..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 7c9903b5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass 
-#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index bc521eed..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename 
Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 901fe816..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 70f635f0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - 
cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 92aaf901..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 7218d5f3..00000000 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index d37b1c08..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index e79fca2b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 16bc9200..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - 
cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index ff3abbd1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 7b51944c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 04846642..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - 
cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index bf95aad5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 8d147553..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance 
"cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombination< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index eb0ddf9a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - 
-#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 61265f6b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 197018a8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - 
int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 95017836..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 85115e8e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" 
-#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index f165fac4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params 
const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 7986b73a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 10dcd177..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 
4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index 2cf55999..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index a1b665d1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic 
ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu deleted file mode 100644 index bf5a59d7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - float, - cutlass::layout::TensorNCHW, - float, - cutlass::layout::TensorNCHW, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationRelu< - float, - 1, - int32_t, - float, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index c62d96d8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 6455430d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - 
cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index e63d982c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 
696542f2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 1af499ad..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index f70e8d25..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index de958f95..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc" generated 
by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 3a195832..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic 
pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 7fd8b76f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 4205a742..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - 
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 431b43ca..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 3d6ada6f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of 
cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 77384ac6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - 
const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index d30558d8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 3e32e6e5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - 
cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index d69a9e9e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index c8678666..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 972816c7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - 
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 5117e5c1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index b4b5550a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored 
"-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index c18385a5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 9725a58a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index d2ab9fd3..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - 
cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 91885956..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index e0deccc2..00000000 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 6f7c1dac..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index d256e23a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 649d27ed..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using 
Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 8e5d4f44..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 799219ab..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index a0fe7fb2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - 
int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index cf2f55e0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 47634aee..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic 
ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::int4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::int4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 52a026ba..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 6c59c066..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index e080b713..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - 
cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 97dae192..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 
100644 index cbff4aaa..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 7027e886..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - 
- -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu deleted file mode 100644 index 3b8ea536..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 97b4ce07..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4" generated by 
cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 71c0baca..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 61fdf2bf..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 065b4d97..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - 
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 45e2ecd1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 66394d5f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of 
cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 7d0383cb..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* 
d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index c1104914..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 9216e8c1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - 
int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 3faa0456..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 2657a4f1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index def60f3d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - 
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 8d37fb31..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index f743166d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC 
diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 5dfe6b54..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, 
- int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 65154c27..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 0aeb15f7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - 
cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index f02fd855..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 0cd9d768..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu deleted file mode 100644 index 9d1b9824..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - 
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 95f37bff..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 3de086be..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include 
"src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index f346d1c1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index cec7576b..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 94062d99..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - 
cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index aa901c37..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index a4581d29..00000000 
--- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index f6be5d3f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index b2513493..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 56164c72..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance 
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 1a46c48e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - 
cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 5b40ac21..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 4fd9bb43..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - 
cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 05117134..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 05b313d0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ 
-1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 5c2f505c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename 
Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 8efd1931..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 79d51ce2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - 
cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 371cb6a7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu deleted file mode 100644 index 8d9ef1af..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 6acff4cc..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - false, - 
cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 35c8fcdf..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index b6cab90e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// 
kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 818cdad0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - 
typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index e2e9b4a4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 0c4d8206..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - 
cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 5f436fa1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index aa49f1be..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 
-// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 15bdd42a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename 
Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 79168d31..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index a196862c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - 
int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index e3d8de90..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 11bd4fd6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - false, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index a1aebf28..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 
4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index f1411857..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 45394e91..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include 
"src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 80fe01d4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& 
conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 3310ab82..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 9afa9cca..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - 
cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu deleted file mode 100644 index 11ec8089..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index f98c6e3f..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 
+0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 2a169bd1..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - 
const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 943e23c6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index ebe14204..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - 
int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index fd088482..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 7da9f455..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index d544a0dc..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 
16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 901424c0..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 90e777c8..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// 
kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 685a9d7e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename 
Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 73bf1d62..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index c6e05459..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - 
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 588bbc12..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index dce1e586..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma 
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index dce1e586..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index fa043faa..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index ec624a3a..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 545b0478..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 67b97d90..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index ba4e2f2d..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu deleted file mode 100644 index aa7116c4..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 3471b9fa..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index ceed408d..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index b7db1261..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 9fcc2f40..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 94e3fc97..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 76628d7f..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index d0fda4ee..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 287ab3bc..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 7f930475..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 6f884781..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 1925a760..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index b0948fd2..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 75789416..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index ea5b2fd7..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index bb56f241..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 2a0cf695..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 0d6b624b..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 746dd299..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 6619b613..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu deleted file mode 100644 index 824d2580..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 72d7da12..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index bcc78062..00000000
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index b4cb06c1..00000000
cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 95002a67..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 8ea2592e..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// 
ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 12b96460..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* 
d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 8aa4d121..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index a8dfd9fb..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - 
cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 45e4d7e4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index 1d12d500..00000000 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index 2666fe1c..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const 
typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu deleted file mode 100644 index dbba1fc5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu deleted file mode 100644 index c177dd20..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - 
cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index fce2a7b2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index b8fe8657..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index e602b4fb..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - 
cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 9a11ca1d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 3a416f6a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored 
"-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 5e83d253..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename 
Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index f5ca0a86..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index d4b88bd6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, 
- int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index f83ef4f6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 
100644 index ab9dd016..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index eda36cc2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index c59981f5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 8df0ceaf..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance 
"cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index ca939ce4..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - 
cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index b97524ee..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 29ec4813..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 64, 8>, - 
cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index d440abb2..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index bc064a26..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ 
/dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index e40529af..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* 
d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index f57da562..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 9891e9d5..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - 
int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 28823bb6..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 25d4453a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 0ccecb64..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - 
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index cae3e75d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index a0d014f7..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" 
-#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 8518aabb..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - 
typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index e2ca5566..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index a6e75395..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - 
cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 0ef0f83a..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 5acd525c..00000000 --- 
a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index 8a70bd48..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu deleted file mode 100644 index f0abfb3d..00000000 --- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorCxRSKx<4>, - cutlass::uint4b_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::layout::TensorNHWC, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - cutlass::uint4b_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 91ef9b82..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance 
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 4327a877..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index f81c5345..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 8cd3d419..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - 
cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 6eb00a4c..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index de3b726c..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 747240fc..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - 
cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 1a7151f3..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 092869c9..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 3418a6a6..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<32, 16, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - 
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 9ed169ed..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 86f0b034..00000000 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index f8e44905..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - 
cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 77815d13..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 10a95d79..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored 
"-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 95ed7a79..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const 
typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 45112372..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index e0e78d1f..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" 
generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 33795436..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t 
stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 1ab249ba..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index f7e1bbb6..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - 
cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 8340a760..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 1c00df1e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index a848d1a6..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<32, 16, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - 
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 7226dcb7..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 3424de30..00000000 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 8056afff..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - 
cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index c7c0cb91..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index a8605228..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored 
"-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index cf313aa2..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename 
Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index ad862cfd..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index bf4044e5..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using 
Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 000c3420..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam 
extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 94d728ef..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 61936ad4..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - 
cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index bdded445..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index f4add434..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 01200c9e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<32, 16, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - 
int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index c00a1f2a..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 53d97da9..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 
@@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index b9829d3c..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename 
Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 112feff5..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - false, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index c190a1ed..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance 
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index c0b7c567..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename 
Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 95fd179e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index d33faf86..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - 
cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 2f1a0e66..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 0d78e2bb..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 3c128645..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - 
cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index f807bd34..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 89604a7d..00000000 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 90271408..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<32, 16, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - 
cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index a2942ae4..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index e8685e45..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include 
"src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index eb9cf847..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - 
int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 5bfa7ff9..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 82765158..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - 
cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index c0ae8376..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 2b66bb58..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index aee8a5d1..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - 
cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 67e37171..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index a9514f6c..00000000 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 37984bc9..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - 
cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 70ed2f42..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 6a5c7d2e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic 
ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 98e4034e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<32, 16, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename 
Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 70708175..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 84a88a6b..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, 
- int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 234db1ec..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 46ed4028..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 07f2e0a7..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - 
cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index ecf3cfc3..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index ba90c2af..00000000 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index e5208560..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - 
cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 594b5e66..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 2548d373..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include 
"src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 2b3bcc3e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* 
workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 7e2c8596..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 5876b004..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - 
cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<16, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index d88ee03e..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<32, 16, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index 13ab6aed..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index e0b8a0fd..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - 
cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu deleted file mode 100644 index ea2343b9..00000000 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::layout::TensorNCxHWx<32>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 8, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu deleted file mode 100644 index 8a5b7e8b..00000000 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu +++ /dev/null @@ -1,59 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4" generated by cutlass generator -using Convolution = - typename cutlass::conv::device::Convolution< - int8_t, - cutlass::layout::TensorNCxHWx<32>, - int8_t, - cutlass::layout::TensorCxRSKx<32>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::conv::ConvType::kConvolution, - cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm75, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<8, 8, 16>, - cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, - 2, - 16, - 16, - true, - cutlass::arch::OpMultiplyAddSaturate, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const typename Convolution::ElementSrc* d_src, - const typename Convolution::ElementFilter* d_filter, - const typename Convolution::ElementBias* d_bias, - const typename Convolution::ElementDst* d_z, - typename Convolution::ElementDst* d_dst, - int* workspace, - typename Convolution::ConvolutionParameter const& conv_param, - typename Convolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, - typename Convolution::ExtraParam extra_param); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu deleted file mode 100644 index c0c8a8fa..00000000 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu +++ /dev/null @@ -1,57 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4" generated by cutlass generator -using Deconvolution = - typename cutlass::conv::device::Deconvolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorKxRSCx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, - 1, - 4, - 8, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void 
megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( - const typename Deconvolution::ElementSrc* d_src, - const typename Deconvolution::ElementFilter* d_filter, - const typename Deconvolution::ElementBias* d_bias, - const typename Deconvolution::ElementDst* d_z, - typename Deconvolution::ElementDst* d_dst, - int* workspace, - typename Deconvolution::ConvolutionParameter const& conv_param, - typename Deconvolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu deleted file mode 100644 index 0eb7ddc3..00000000 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu +++ /dev/null @@ -1,57 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4" generated by cutlass generator -using Deconvolution = - typename cutlass::conv::device::Deconvolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorKxRSCx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 128, 16>, - cutlass::gemm::GemmShape<16, 64, 16>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( - const typename Deconvolution::ElementSrc* d_src, - const typename Deconvolution::ElementFilter* d_filter, - const typename Deconvolution::ElementBias* d_bias, - const typename Deconvolution::ElementDst* d_z, - typename Deconvolution::ElementDst* d_dst, - int* workspace, - typename Deconvolution::ConvolutionParameter const& conv_param, - typename Deconvolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu deleted file mode 100644 index 39a50fc1..00000000 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu +++ /dev/null @@ -1,57 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4" generated by cutlass generator -using 
Deconvolution = - typename cutlass::conv::device::Deconvolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorKxRSCx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, - 2, - 4, - 4, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( - const typename Deconvolution::ElementSrc* d_src, - const typename Deconvolution::ElementFilter* d_filter, - const typename Deconvolution::ElementBias* d_bias, - const typename Deconvolution::ElementDst* d_z, - typename Deconvolution::ElementDst* d_dst, - int* workspace, - typename Deconvolution::ConvolutionParameter const& conv_param, - typename Deconvolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu deleted file mode 100644 index cfcbe8ef..00000000 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu +++ /dev/null @@ -1,57 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4" generated by cutlass generator -using Deconvolution = - typename cutlass::conv::device::Deconvolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorKxRSCx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<32, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( - const typename Deconvolution::ElementSrc* d_src, - const typename Deconvolution::ElementFilter* d_filter, - const typename Deconvolution::ElementBias* d_bias, - const typename Deconvolution::ElementDst* d_z, - typename Deconvolution::ElementDst* d_dst, - int* workspace, - typename Deconvolution::ConvolutionParameter const& conv_param, - typename Deconvolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu 
b/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu deleted file mode 100644 index 1da05f64..00000000 --- a/dnn/src/cuda/convolution/backward_data/int8/kimpl/cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu +++ /dev/null @@ -1,57 +0,0 @@ - -#if !MEGDNN_TEGRA_X1 -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl" - - -// kernel instance "cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4" generated by cutlass generator -using Deconvolution = - typename cutlass::conv::device::Deconvolution< - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int8_t, - cutlass::layout::TensorKxRSCx<4>, - int8_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::layout::TensorNCxHWx<4>, - int32_t, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm61, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<1, 1, 4>, - cutlass::epilogue::thread::BiasAddLinearCombinationClamp< - int8_t, - 4, - int32_t, - int32_t, - float - >, - cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, - 2, - 4, - 16, - true, - cutlass::arch::OpMultiplyAdd, - cutlass::conv::ImplicitGemmMode::GEMM_NT>; - - - -template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( - const typename Deconvolution::ElementSrc* d_src, - const typename Deconvolution::ElementFilter* d_filter, - const typename Deconvolution::ElementBias* d_bias, - const typename Deconvolution::ElementDst* d_z, - typename Deconvolution::ElementDst* d_dst, - int* workspace, - typename Deconvolution::ConvolutionParameter const& conv_param, - typename Deconvolution::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream); - - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nn_align1.cu deleted file mode 100644 index 8835d03e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x128_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename 
Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nt_align1.cu deleted file mode 100644 index d80e3da0..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x128_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tn_align1.cu deleted file mode 100644 index 90615451..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x128_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, 
cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tt_align1.cu deleted file mode 100644 index e425c9ee..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x128_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x128_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nn_align1.cu deleted file mode 100644 index 7ccda810..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 
2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nt_align1.cu deleted file mode 100644 index 5bbe80ea..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x32_8x2_nt_align1::EpilogueOutputOp::Params const& 
epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tn_align1.cu deleted file mode 100644 index 0d90cfc0..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tt_align1.cu deleted file mode 100644 index 7ac7f3ec..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x32_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - 
>; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nn_align1.cu deleted file mode 100644 index a8dfa509..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nt_align1.cu deleted file mode 100644 index f9c45771..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x64_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1 = 
cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tn_align1.cu deleted file mode 100644 index 2cfeb57c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tt_align1.cu deleted file mode 100644 index 09e1459b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_128x64_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if 
__CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_128x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_128x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nn_align1.cu deleted file mode 100644 index 3c3f2e16..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x128_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 128, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename 
Operation_cutlass_simt_sgemm_16x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nt_align1.cu deleted file mode 100644 index 960c221e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x128_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 128, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tn_align1.cu deleted file mode 100644 index b7a7c1ee..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x128_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 128, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tt_align1.cu deleted file mode 100644 index 17273792..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x128_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x128_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 128, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nn_align1.cu deleted file mode 100644 index 0a0119f2..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // 
Gemm operator cutlass_simt_sgemm_16x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nt_align1.cu deleted file mode 100644 index 37c8eaa6..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tn_align1.cu deleted file mode 100644 index 9bc2438d..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tt_align1.cu deleted file mode 100644 index fe2c5482..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x32_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1::ElementC* 
d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nn_align1.cu deleted file mode 100644 index 7a3efade..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nt_align1.cu deleted file mode 100644 index 53e32ead..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x64_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - 
cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tn_align1.cu deleted file mode 100644 index cf9dd810..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tt_align1.cu deleted file mode 100644 index 53b44096..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_16x64_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_16x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_16x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nn_align1.cu deleted file mode 100644 index d88caa76..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_256x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 32, 8>, - cutlass::gemm::GemmShape<64, 16, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_256x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nt_align1.cu deleted file 
mode 100644 index 28652ab9..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_256x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 32, 8>, - cutlass::gemm::GemmShape<64, 16, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_256x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tn_align1.cu deleted file mode 100644 index 04738c31..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_256x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 32, 8>, - cutlass::gemm::GemmShape<64, 16, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename 
Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_256x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tt_align1.cu deleted file mode 100644 index fe4c6356..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x32_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_256x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 32, 8>, - cutlass::gemm::GemmShape<64, 16, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_256x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nn_align1.cu deleted file mode 100644 index 4f43a3cc..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_256x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - 
cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_256x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nt_align1.cu deleted file mode 100644 index ba1354c9..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_256x64_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_256x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tn_align1.cu deleted file mode 100644 index 70fbb154..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" 
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_256x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_256x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tt_align1.cu deleted file mode 100644 index 81481800..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_256x64_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_256x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_256x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nn_align1.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nn_align1.cu deleted file mode 100644 index a98ac691..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x128_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nt_align1.cu deleted file mode 100644 index 69004b21..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x128_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1::ElementA* d_A, size_t lda, - 
const typename Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tn_align1.cu deleted file mode 100644 index 28926368..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x128_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tt_align1.cu deleted file mode 100644 index f9757db1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x128_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x128_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - 
cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nn_align1.cu deleted file mode 100644 index 56af657e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x256_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 256, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x256_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nt_align1.cu deleted file mode 100644 index 46110113..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" 
-#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x256_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 256, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x256_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tn_align1.cu deleted file mode 100644 index 2fde5da8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x256_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 256, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x256_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tt_align1.cu deleted file mode 100644 index 095eeaaf..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x256_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x256_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 256, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x256_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nn_align1.cu deleted file mode 100644 index a6cd1554..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename 
Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nt_align1.cu deleted file mode 100644 index 334ee1d8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tn_align1.cu deleted file mode 100644 index 572103fd..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, 
cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tt_align1.cu deleted file mode 100644 index ca174a9a..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x32_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nn_align1.cu deleted file mode 100644 index 42d2171a..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma 
GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nt_align1.cu deleted file mode 100644 index 7efcd307..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x64_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif 
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tn_align1.cu deleted file mode 100644 index faa46fb3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tt_align1.cu deleted file mode 100644 index b92ea109..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_32x64_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_32x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename 
Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_32x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nn_align1.cu deleted file mode 100644 index c894ca7c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x128_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nt_align1.cu deleted file mode 100644 index 88cfabd3..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x128_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, 
cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tn_align1.cu deleted file mode 100644 index 68b69eab..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x128_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tt_align1.cu deleted file mode 100644 index b1ff3810..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x128_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore 
warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x128_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nn_align1.cu deleted file mode 100644 index d967f2f5..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x256_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 256, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x256_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - 
cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nt_align1.cu deleted file mode 100644 index 0da5a24b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x256_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 256, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x256_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tn_align1.cu deleted file mode 100644 index 5afa52ed..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x256_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 256, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - 
-template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x256_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tt_align1.cu deleted file mode 100644 index 55537c28..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x256_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x256_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 256, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x256_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nn_align1.cu deleted file mode 100644 index 9967f9ac..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1 = 
cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nt_align1.cu deleted file mode 100644 index 8622d27a..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tn_align1.cu deleted file mode 100644 index 64fd370f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || 
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tt_align1.cu deleted file mode 100644 index 91c8529c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x32_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename 
Operation_cutlass_simt_sgemm_64x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nn_align1.cu deleted file mode 100644 index 552e3070..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nt_align1.cu deleted file mode 100644 index af861cdd..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x64_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tn_align1.cu deleted file mode 100644 index 8a51ef22..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_64x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tt_align1.cu deleted file mode 100644 index a17672ef..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_64x64_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm 
operator cutlass_simt_sgemm_64x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_64x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nn_align1.cu deleted file mode 100644 index c4b8ac1d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_8x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_8x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nt_align1.cu deleted file mode 100644 index d2d55fdf..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_nt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_8x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_8x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tn_align1.cu deleted file mode 100644 index f311d75c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tn_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_8x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1::ElementC* d_C, size_t ldc, 
- int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_8x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tt_align1.cu deleted file mode 100644 index 00c1c9c9..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_8x32_8x2_tt_align1.cu +++ /dev/null @@ -1,49 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_8x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1 = cutlass::gemm::device::Gemm< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, - 2, - 1, - 1, - false, - cutlass::arch::OpMultiplyAdd - - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_8x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu deleted file mode 100644 index 77eb01ca..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, 
- cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu deleted file mode 100644 index f5fa4f0a..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu deleted file mode 100644 index 3ac6a2bd..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored 
"-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu deleted file mode 100644 index abac3e7e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu deleted file mode 100644 index 672d2f43..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu deleted file mode 100644 index 876263f4..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void 
megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu deleted file mode 100644 index d338c160..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu deleted file mode 100644 index 7f0ab852..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include 
"src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu deleted file mode 100644 index 1d8d9ff1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu deleted file mode 100644 index 01a16c59..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu deleted file mode 100644 index 8a53c72a..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1::ElementA* d_A, 
size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu deleted file mode 100644 index a175bd7d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<128, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu deleted file mode 100644 index 898008db..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1 - using 
Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 128, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu deleted file mode 100644 index c4dec94b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 128, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu deleted file mode 100644 index ef795a29..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 128, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu deleted file mode 100644 index f8826de6..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 128, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename 
Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu deleted file mode 100644 index 76338c38..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu deleted file mode 100644 index 291301bd..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, 
cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu deleted file mode 100644 index 3aec7e52..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu deleted file mode 100644 index 722a3e56..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of 
cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<16, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu deleted file mode 100644 index 1113e8ae..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - 
cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu deleted file mode 100644 index d7217f76..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu deleted file mode 100644 index 427216e2..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - 
- -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu deleted file mode 100644 index 4dce8045..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu deleted file mode 100644 index ef2be0d9..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include 
"src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 32, 8>, - cutlass::gemm::GemmShape<64, 16, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu deleted file mode 100644 index fee41926..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 32, 8>, - cutlass::gemm::GemmShape<64, 16, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu 
b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu deleted file mode 100644 index f0c20a9c..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 32, 8>, - cutlass::gemm::GemmShape<64, 16, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu deleted file mode 100644 index af24f798..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 32, 8>, - cutlass::gemm::GemmShape<64, 16, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1::ElementA* d_A, 
size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu deleted file mode 100644 index abb555d9..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu deleted file mode 100644 index 9b74fccd..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1 - using 
Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu deleted file mode 100644 index 6ea860a7..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu deleted file mode 100644 index d0c41a87..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<256, 64, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu deleted file mode 100644 index 02df0200..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename 
Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu deleted file mode 100644 index f351dc22..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu deleted file mode 100644 index 07e3e2f0..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - 
float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu deleted file mode 100644 index e4f10562..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu deleted file mode 100644 index d95f6269..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// 
ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 256, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu deleted file mode 100644 index 2b5c3f46..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 256, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename 
Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu deleted file mode 100644 index f9d2760e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 256, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu deleted file mode 100644 index e8acc8d1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 256, 8>, - cutlass::gemm::GemmShape<16, 64, 8>, - 
cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu deleted file mode 100644 index a87a1c72..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu deleted file mode 100644 index a088516a..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic 
ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu deleted file mode 100644 index ca317b8b..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu deleted file mode 100644 index 2b90bf0e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<32, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu deleted file mode 100644 index dcff6500..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const 
typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu deleted file mode 100644 index 8a6a092d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu deleted file mode 100644 index 617de1e6..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator 
cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu deleted file mode 100644 index dd2d4ba0..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu deleted file mode 100644 index 
7eb8fe18..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu deleted file mode 100644 index fd4f4415..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1::ElementB* d_B, size_t ldb, - 
typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu deleted file mode 100644 index 1b5412a8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu deleted file mode 100644 index d2f6f7a8..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, 
cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 128, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu deleted file mode 100644 index 7b169008..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 256, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu deleted file mode 100644 index 653bcd12..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 
9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 256, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu deleted file mode 100644 index 3e62fcdf..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 256, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename 
Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu deleted file mode 100644 index ef6b2922..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 256, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu deleted file mode 100644 index 1a112197..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - 
cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu deleted file mode 100644 index c034e358..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu deleted file mode 100644 index 7138361d..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic 
ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu deleted file mode 100644 index b783dc8f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<64, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu deleted file mode 100644 index 2dc0c571..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu deleted file mode 100644 index 984256e6..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const 
typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu deleted file mode 100644 index f5b8e9b1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu deleted file mode 100644 index ed003053..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator 
cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<64, 64, 8>, - cutlass::gemm::GemmShape<32, 64, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu deleted file mode 100644 index 51d3e881..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu deleted file mode 100644 index b0c00d7c..00000000 --- 
a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu deleted file mode 100644 index 7ef5c6f2..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::ColumnMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1::ElementB* d_B, size_t ldb, - typename 
Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu b/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu deleted file mode 100644 index 494c8af1..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt/kimpl/cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu +++ /dev/null @@ -1,42 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl" - - - // Gemm operator cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1 - using Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1 = cutlass::gemm::device::GemmSplitKParallel< - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, - cutlass::arch::OpClassSimt, - cutlass::arch::Sm50, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<8, 32, 8>, - cutlass::gemm::GemmShape<1, 1, 1>, - cutlass::epilogue::thread::LinearCombination< - float, - 1, - float, - float - > - >; - - -template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper( - const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1::ElementA* d_A, size_t lda, - const typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1::ElementB* d_B, size_t ldb, - typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1::ElementC* d_C, size_t ldc, - int* workspace, - cutlass::gemm::GemmCoord const& problem_size, - typename Operation_cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1::EpilogueOutputOp::Params const& epilogue, - cudaStream_t stream, int split_k_slices); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu deleted file mode 100644 index 796a1849..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4 - using Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 128, 16>, - cutlass::gemm::GemmShape<1, 4, 2>, - 
float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu deleted file mode 100644 index ed2bfa09..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2 - using Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 128, 16>, - cutlass::gemm::GemmShape<1, 2, 4>, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu deleted file mode 100644 index 09c0c11f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1 - using 
Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 128, 2>, - cutlass::gemm::GemmShape<1, 1, 1>, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu deleted file mode 100644 index 09c84e49..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4 - using Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 128, 32>, - cutlass::gemm::GemmShape<1, 4, 4>, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu deleted file mode 100644 index 2be9035e..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include 
"src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2 - using Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 128, 4>, - cutlass::gemm::GemmShape<1, 2, 1>, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu deleted file mode 100644 index 1c751e12..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1 - using Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 128, 4>, - cutlass::gemm::GemmShape<1, 1, 2>, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu deleted file mode 100644 index ece1c4d7..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored 
"-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4 - using Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 128, 8>, - cutlass::gemm::GemmShape<1, 4, 1>, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu deleted file mode 100644 index 13031cfb..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2 - using Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 128, 8>, - cutlass::gemm::GemmShape<1, 2, 2>, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu deleted file mode 100644 index 4dda125f..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of 
cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1 - using Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 128, 8>, - cutlass::gemm::GemmShape<1, 1, 4>, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu deleted file mode 100644 index d0653427..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu +++ /dev/null @@ -1,31 +0,0 @@ - -#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2) -// ignore warning of cutlass -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wuninitialized" -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl" - - - // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4 - using Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4 = cutlass::gemm::kernel::DefaultGemv< - cutlass::gemm::GemmShape<1, 32, 128>, - cutlass::gemm::GemmShape<1, 4, 4>, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor, - float, cutlass::layout::RowMajor - >; - - -template void megdnn::cuda::cutlass_wrapper:: - cutlass_vector_matrix_mul_batched_strided_wrapper( - BatchedGemmCoord const& problem_size, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4::ElementA* d_A, size_t lda, size_t batch_stride_a, - const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4::ElementB* d_B, size_t ldb, size_t batch_stride_b, - typename Operation_cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c, - cudaStream_t stream); - -#pragma GCC diagnostic pop -#endif diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu deleted file mode 100644 index 4cfc6d63..00000000 --- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu +++ /dev/null @@ 
-1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2
-  using Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 32, 16>,
-    cutlass::gemm::GemmShape<1, 2, 1>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu
deleted file mode 100644
index 41b3a65d..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1
-  using Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 32, 16>,
-    cutlass::gemm::GemmShape<1, 1, 2>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu
deleted file mode 100644
index afe747f9..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4
-  using Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 32, 32>,
-    cutlass::gemm::GemmShape<1, 4, 1>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu
deleted file mode 100644
index f9b65472..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2
-  using Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 32, 32>,
-    cutlass::gemm::GemmShape<1, 2, 2>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu
deleted file mode 100644
index 7b0f8c1f..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1
-  using Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 32, 32>,
-    cutlass::gemm::GemmShape<1, 1, 4>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu
deleted file mode 100644
index 7fd28ef7..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4
-  using Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 32, 64>,
-    cutlass::gemm::GemmShape<1, 4, 2>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu
deleted file mode 100644
index b61d3b55..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2
-  using Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 32, 64>,
-    cutlass::gemm::GemmShape<1, 2, 4>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu
deleted file mode 100644
index b64ee3be..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1
-  using Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 32, 8>,
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu
deleted file mode 100644
index efc6f3dd..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4
-  using Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 64, 16>,
-    cutlass::gemm::GemmShape<1, 4, 1>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu
deleted file mode 100644
index 38eef285..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2
-  using Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 64, 16>,
-    cutlass::gemm::GemmShape<1, 2, 2>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu
deleted file mode 100644
index 9db9b56a..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1
-  using Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 64, 16>,
-    cutlass::gemm::GemmShape<1, 1, 4>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu
deleted file mode 100644
index b3979fad..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4
-  using Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 64, 32>,
-    cutlass::gemm::GemmShape<1, 4, 2>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu
deleted file mode 100644
index 29f0f783..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2
-  using Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 64, 32>,
-    cutlass::gemm::GemmShape<1, 2, 4>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu
deleted file mode 100644
index 0814d571..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1
-  using Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 64, 4>,
-    cutlass::gemm::GemmShape<1, 1, 1>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu
deleted file mode 100644
index 53567cdf..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4
-  using Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 64, 64>,
-    cutlass::gemm::GemmShape<1, 4, 4>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu
deleted file mode 100644
index ab26e0fd..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2
-  using Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 64, 8>,
-    cutlass::gemm::GemmShape<1, 2, 1>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
diff --git a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu b/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu
deleted file mode 100644
index 9321d8fc..00000000
--- a/dnn/src/cuda/matrix_mul/fp32_simt_gemv/kimpl/cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu
+++ /dev/null
@@ -1,31 +0,0 @@
-
-#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
-// ignore warning of cutlass
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-parameter"
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
-#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
-
-
-  // Gemm operator cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1
-  using Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1 = cutlass::gemm::kernel::DefaultGemv<
-    cutlass::gemm::GemmShape<1, 64, 8>,
-    cutlass::gemm::GemmShape<1, 1, 2>,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor,
-    float, cutlass::layout::RowMajor
-  >;
-
-
-template void megdnn::cuda::cutlass_wrapper::
-    cutlass_vector_matrix_mul_batched_strided_wrapper(
-        BatchedGemmCoord const& problem_size,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1::ElementA* d_A, size_t lda, size_t batch_stride_a,
-        const typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1::ElementB* d_B, size_t ldb, size_t batch_stride_b,
-        typename Operation_cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1::ElementCD* d_C, size_t ldc, size_t batch_stride_c,
-        cudaStream_t stream);
-
-#pragma GCC diagnostic pop
-#endif
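Every kimpl unit removed above follows the same pattern: include the batched strided GEMV wrapper `.cuinl`, define one `cutlass::gemm::kernel::DefaultGemv` configuration, and explicitly instantiate `cutlass_vector_matrix_mul_batched_strided_wrapper` for it. The sketch below mirrors that pattern for the `1x32_32_tt_align4x1` configuration and adds a hypothetical host-side dispatch helper; `run_sgemv_batched_strided`, the explicit template argument on the wrapper, and the fully qualified `cutlass::gemm::BatchedGemmCoord` spelling are illustrative assumptions, not code introduced by this diff.

```cpp
// Sketch only: mirrors one deleted kimpl unit and shows how a caller might
// dispatch it. Helper name and explicit template argument are assumptions.
#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"

// Threadblock tile 1x32x32 with a 1x1x4 per-thread tile, fp32 row-major A/B/C
// (same configuration as cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1).
using GemvKernel = cutlass::gemm::kernel::DefaultGemv<
        cutlass::gemm::GemmShape<1, 32, 32>,  // threadblock shape (M, N, K)
        cutlass::gemm::GemmShape<1, 1, 4>,    // per-thread shape
        float, cutlass::layout::RowMajor,     // A
        float, cutlass::layout::RowMajor,     // B
        float, cutlass::layout::RowMajor>;    // C/D

// Hypothetical host-side helper: one GEMV (m == 1) per batch entry, with the
// per-matrix strides forwarded to the strided wrapper.
void run_sgemv_batched_strided(
        const float* d_A, size_t lda, size_t batch_stride_a,
        const float* d_B, size_t ldb, size_t batch_stride_b,
        float* d_C, size_t ldc, size_t batch_stride_c,
        int n, int k, int batch, cudaStream_t stream) {
    cutlass::gemm::BatchedGemmCoord problem_size(1, n, k, batch);
    megdnn::cuda::cutlass_wrapper::
            cutlass_vector_matrix_mul_batched_strided_wrapper<GemvKernel>(
                    problem_size, d_A, lda, batch_stride_a,
                    d_B, ldb, batch_stride_b,
                    d_C, ldc, batch_stride_c, stream);
}
```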