GitOrigin-RevId: da3bcfb85a
release-1.5
@@ -1,5 +1,6 @@
# Mark generated files as binary, ignore them in git diff.
# dnn
dnn/scripts/cutlass_generator/list.bzl binary
dnn/src/cuda/conv_bias/int4/kimpl/* binary
dnn/src/cuda/conv_bias/int8/kimpl/* binary
dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary
@@ -0,0 +1,18 @@
load("list.bzl", "cutlass_gen_list")

genrule(
    name = "cutlass_kimpls",
    outs = cutlass_gen_list,
    cmd = """GEN=$(location //brain/megbrain/dnn/scripts/cutlass_generator:generator.py)
pwd > /tmp/a
echo $(@D) > /tmp/b
python3 $$GEN --operations gemm --type simt $(@D)
python3 $$GEN --operations gemv --type simt $(@D)
python3 $$GEN --operations deconv --type simt $(@D)
python3 $$GEN --operations conv2d --type simt $(@D)
python3 $$GEN --operations conv2d --type tensorop8816 $(@D)
python3 $$GEN --operations conv2d --type tensorop8832 $(@D)
""",
    tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"],
    visibility = ["//visibility:public"],
)
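With this rule in place, the generated kernels can be built directly; assuming the BUILD file lives in the same package as the generator, as the tool label above suggests, the invocation would be:

```bash
bazel build //brain/megbrain/dnn/scripts/cutlass_generator:cutlass_kimpls
```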
@@ -0,0 +1,19 @@
# Generate device kernel registration code for CUTLASS kernels

## Usage

```bash
python3 generator.py [--operations {gemm,gemv,conv2d,deconv}] [--type {simt,tensorop8816,tensorop8832}] output
```

- `--operations`: operation kind, one of gemm, gemv, conv2d, deconv
- `--type`: opcode class, one of simt, tensorop8816, tensorop8832
- `output`: the output directory for the generated CUTLASS kernels
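For example, the following invocation (mirroring one line of the bazel `genrule`) writes the SIMT conv2d kernels into a hypothetical directory `out_dir`:

```bash
python3 generator.py --operations conv2d --type simt out_dir
```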
## Generate file list for bazel

We generate `list.bzl` because bazel's `genrule` requires the output file list to be known in the analysis phase. Run `gen_list.py` whenever new operations are added (an example of the generated file is shown after this section):

```bash
python3 gen_list.py
```
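For reference, `gen_list.py` (shown later in this diff) emits a `list.bzl` of the following shape; the concrete file names come from each operation's `procedural_name()` and are elided here:

```python
# Generated by dnn/scripts/cutlass_generator/gen_list.py

cutlass_gen_list = [
    "cutlass_simt_....cu",  # one entry per generated kernel
    # ...
]
```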
@@ -0,0 +1,614 @@
#
# \file conv2d_operation.py
#
# \brief Generates the CUTLASS Library's instances
#
#

import enum
import os.path
import shutil
from typing import Tuple, List

from lazy_file import LazyFile
from library import *

###################################################################################################
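# `SubstituteTemplate` used throughout this file comes from library.py via the
# star import above; the library.py excerpt at the end of this diff is
# truncated before its definition.  In the CUTLASS generator scripts it is
# essentially repeated ${key} substitution, roughly:
#
#     def SubstituteTemplate(template, values):
#         text = template
#         changed = True
#         while changed:
#             changed = False
#             for key, value in values.items():
#                 regex = "\\$\\{%s\\}" % key
#                 newtext = re.sub(regex, value, text)
#                 if newtext != text:
#                     changed = True
#                 text = newtext
#         return text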
#
class Conv2dOperation:
    #
    def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst,
                 element_epilogue, epilogue_functor = EpilogueFunctor.LinearCombination,
                 swizzling_functor = SwizzlingFunctor.Identity4,
                 need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNt):
        self.operation_kind = OperationKind.Conv2d
        self.conv_kind = conv_kind
        self.arch = arch
        self.tile_description = tile_description
        self.conv_type = conv_type
        self.src = src
        self.flt = flt
        self.bias = bias
        self.dst = dst
        self.element_epilogue = element_epilogue
        self.epilogue_functor = epilogue_functor
        self.swizzling_functor = swizzling_functor
        self.need_load_from_const = need_load_from_const
        self.implicit_gemm_mode = implicit_gemm_mode

    #
    def accumulator_type(self):
        return self.tile_description.math_instruction.element_accumulator

    #
    def core_name(self):
        ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
        intermediate_type = ''

        if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp:
            inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
            if self.tile_description.math_instruction.element_a != self.flt.element and \
                    self.tile_description.math_instruction.element_a != self.accumulator_type():
                intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
        else:
            inst_shape = ''

        unity_kernel = ''
        if not self.need_load_from_const:
            unity_kernel = '_1x1'

        return "%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()],
                                  inst_shape, intermediate_type, ConvKindNames[self.conv_kind],
                                  unity_kernel, ShortEpilogueNames[self.epilogue_functor])

    #
    def extended_name(self):
        if self.dst.element != self.tile_description.math_instruction.element_accumulator:
            if self.src.element != self.flt.element:
                extended_name = "${element_dst}_${core_name}_${element_src}_${element_flt}"
            else:
                extended_name = "${element_dst}_${core_name}_${element_src}"
        else:
            if self.src.element != self.flt.element:
                extended_name = "${core_name}_${element_src}_${element_flt}"
            else:
                extended_name = "${core_name}_${element_src}"

        return SubstituteTemplate(extended_name, {
            'element_src': DataTypeNames[self.src.element],
            'element_flt': DataTypeNames[self.flt.element],
            'element_dst': DataTypeNames[self.dst.element],
            'core_name': self.core_name(),
        })

    #
    def layout_name(self):
        if self.src.layout == self.dst.layout:
            layout_name = "${src_layout}_${flt_layout}"
        else:
            layout_name = "${src_layout}_${flt_layout}_${dst_layout}"

        return SubstituteTemplate(layout_name, {
            'src_layout': ShortLayoutTypeNames[self.src.layout],
            'flt_layout': ShortLayoutTypeNames[self.flt.layout],
            'dst_layout': ShortLayoutTypeNames[self.dst.layout],
        })

    #
    def configuration_name(self):
        ''' The full procedural name indicates opcode class, extended name, tile size, and layout. '''
        opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]

        warp_shape = [int(self.tile_description.threadblock_shape[idx] / self.tile_description.warp_count[idx])
                      for idx in range(3)]
        threadblock = "%dx%dx%d_%dx%dx%d_%d" % (
            self.tile_description.threadblock_shape[0],
            self.tile_description.threadblock_shape[1],
            self.tile_description.threadblock_shape[2],
            warp_shape[0],
            warp_shape[1],
            warp_shape[2],
            self.tile_description.stages,
        )

        return SubstituteTemplate(
            "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}",
            {
                'opcode_class': opcode_class_name,
                'extended_name': self.extended_name(),
                'threadblock': threadblock,
                'layout': self.layout_name(),
            }
        )

    #
    def procedural_name(self):
        ''' The procedural name is the same as the configuration name. '''
        return self.configuration_name()
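# Worked example of the `threadblock` segment above (values taken from one of
# the SIMT tiles generated later in this diff): a [128, 128, 32] threadblock
# with warp_count [2, 4, 1] and 2 stages gives
#
#     warp_shape  = [128 // 2, 128 // 4, 32 // 1]   # [64, 32, 32]
#     threadblock = "128x128x32_64x32x32_2"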
###################################################################################################
#
# Emits single instances of a CUTLASS device-wide operator
#
###################################################################################################

class EmitConv2dInstance:
    def __init__(self):
        self.template = """
// kernel instance "${operation_name}" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        ${element_src},
        ${layout_src},
        ${element_flt},
        ${layout_flt},
        ${element_dst},
        ${layout_dst},
        ${element_bias},
        ${layout_bias},
        ${element_accumulator},
        ${conv_type},
        ${opcode_class},
        ${arch},
        cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
        cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
        cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
        ${epilogue_functor}<
            ${element_dst},
            ${epilogue_vector_length},
            ${element_accumulator},
            ${element_bias},
            ${element_epilogue}
        >,
        ${swizzling_functor},
        ${stages},
        ${alignment_src},
        ${alignment_filter},
        ${nonunity_kernel},
        ${math_operator},
        ${implicit_gemm_mode}>;
"""

    def emit(self, operation):
        warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx])
                      for idx in range(3)]
        epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128)
                                     / DataTypeSize[operation.dst.element])
        values = {
            'operation_name': operation.procedural_name(),
            'conv_type': ConvTypeTag[operation.conv_type],
            'element_src': DataTypeTag[operation.src.element],
            'layout_src': LayoutTag[operation.src.layout],
            'element_flt': DataTypeTag[operation.flt.element],
            'layout_flt': LayoutTag[operation.flt.layout],
            'element_dst': DataTypeTag[operation.dst.element],
            'layout_dst': LayoutTag[operation.dst.layout],
            'element_bias': DataTypeTag[operation.bias.element],
            'layout_bias': LayoutTag[operation.bias.layout],
            'element_accumulator': DataTypeTag[operation.accumulator_type()],
            'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
            'arch': "cutlass::arch::Sm%d" % operation.arch,
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            'warp_shape_m': str(warp_shape[0]),
            'warp_shape_n': str(warp_shape[1]),
            'warp_shape_k': str(warp_shape[2]),
            'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
            'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
            'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
            'epilogue_vector_length': str(epilogue_vector_length),
            'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
            'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
            'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
            'stages': str(operation.tile_description.stages),
            'alignment_src': str(operation.src.alignment),
            'alignment_filter': str(operation.flt.alignment),
            'nonunity_kernel': str(operation.need_load_from_const).lower(),
            'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
            'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode],
        }
        return SubstituteTemplate(self.template, values)
class EmitDeconvInstance:
    def __init__(self):
        self.template = """
// kernel instance "${operation_name}" generated by cutlass generator
using Deconvolution =
    typename cutlass::conv::device::Deconvolution<
        ${element_src},
        ${layout_src},
        ${element_flt},
        ${layout_flt},
        ${element_dst},
        ${layout_dst},
        ${element_bias},
        ${layout_bias},
        ${element_accumulator},
        ${opcode_class},
        ${arch},
        cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
        cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
        cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
        ${epilogue_functor}<
            ${element_dst},
            ${epilogue_vector_length},
            ${element_accumulator},
            ${element_bias},
            ${element_epilogue}
        >,
        ${swizzling_functor},
        ${stages},
        ${alignment_src},
        ${alignment_filter},
        ${nonunity_kernel},
        ${math_operator},
        ${implicit_gemm_mode}>;
"""

    def emit(self, operation):
        warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx])
                      for idx in range(3)]
        epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128)
                                     / DataTypeSize[operation.dst.element])
        values = {
            'operation_name': operation.procedural_name(),
            'element_src': DataTypeTag[operation.src.element],
            'layout_src': LayoutTag[operation.src.layout],
            'element_flt': DataTypeTag[operation.flt.element],
            'layout_flt': LayoutTag[operation.flt.layout],
            'element_dst': DataTypeTag[operation.dst.element],
            'layout_dst': LayoutTag[operation.dst.layout],
            'element_bias': DataTypeTag[operation.bias.element],
            'layout_bias': LayoutTag[operation.bias.layout],
            'element_accumulator': DataTypeTag[operation.accumulator_type()],
            'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
            'arch': "cutlass::arch::Sm%d" % operation.arch,
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            'warp_shape_m': str(warp_shape[0]),
            'warp_shape_n': str(warp_shape[1]),
            'warp_shape_k': str(warp_shape[2]),
            'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
            'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
            'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
            'epilogue_vector_length': str(epilogue_vector_length),
            'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
            'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
            'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
            'stages': str(operation.tile_description.stages),
            'alignment_src': str(operation.src.alignment),
            'alignment_filter': str(operation.flt.alignment),
            'nonunity_kernel': str(operation.need_load_from_const).lower(),
            'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
            'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode],
        }
        return SubstituteTemplate(self.template, values)
###################################################################################################
#
# Generator functions for all layouts
#
###################################################################################################

#
def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type,
                   min_cc, src_align = 32, flt_align = 32, dst_align = 128,
                   skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNt):
    operations = []

    element_epilogue = DataType.f32
    if conv_kind == ConvKind.Fprop:
        if src_layout == LayoutType.TensorNHWC:
            swizzling_functor = SwizzlingFunctor.ConvFpropNHWC
        else:
            swizzling_functor = SwizzlingFunctor.ConvFpropNCxHWx
    else:
        swizzling_functor = SwizzlingFunctor.ConvDgradNCxHWx

    # skip rule
    def filter_tile_with_layout(tile: TileDescription, layout: LayoutType) -> bool:
        return layout == LayoutType.TensorNC32HW32 and \
            tile.threadblock_shape[0] % 32 != 0

    # rule for bias_type and epilogues
    def get_bias_type_and_epilogues(tile: TileDescription,
                                    out_dtype: DataType) -> Tuple[DataType, List[EpilogueFunctor]]:
        if tile.math_instruction.element_accumulator == DataType.s32 and \
                out_dtype != DataType.f32:
            bias_type = DataType.s32
            if tile.math_instruction.element_b == DataType.u4:
                epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp,
                             EpilogueFunctor.BiasAddLinearCombinationReluClamp]
            else:
                epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp,
                             EpilogueFunctor.BiasAddLinearCombinationReluClamp,
                             EpilogueFunctor.BiasAddLinearCombinationHSwishClamp]
        elif tile.math_instruction.element_accumulator == DataType.f32 or \
                out_dtype == DataType.f32:
            bias_type = DataType.f32
            epilogues = [EpilogueFunctor.BiasAddLinearCombination,
                         EpilogueFunctor.BiasAddLinearCombinationRelu,
                         EpilogueFunctor.BiasAddLinearCombinationHSwish]
        return bias_type, epilogues

    # rule for filter alignment
    def get_flt_align(tile: TileDescription) -> int:
        nonlocal flt_align
        if tile.math_instruction.opcode_class == OpcodeClass.Simt \
                and tile.math_instruction.element_accumulator == DataType.s32:
            thread_num = tile.warp_count[0] * tile.warp_count[1] * tile.warp_count[2] * 32
            flt_block = tile.threadblock_shape[0] * tile.threadblock_shape[2] \
                * DataTypeSize[tile.math_instruction.element_a]
            load_per_thread = flt_block // thread_num
            if load_per_thread >= 128:
                flt_align = 128
            elif load_per_thread >= 64:
                flt_align = 64
            else:
                assert load_per_thread >= 32
                flt_align = 32
        return flt_align
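    # e.g. for the s8 SIMT math instruction used by GenerateConv2d_Simt
    # (DataTypeSize is in bits, so s8 is 8), a [128, 128, 32] threadblock
    # with warp_count [2, 4, 1] gives
    #     thread_num      = 2 * 4 * 1 * 32 = 256
    #     flt_block       = 128 * 32 * 8   = 32768 bits
    #     load_per_thread = 32768 // 256   = 128   ->  flt_align = 128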
    def get_dst_align(tile: TileDescription, out_layout: LayoutType) -> int:
        nonlocal dst_align
        if tile.math_instruction.opcode_class == OpcodeClass.TensorOp \
                and out_layout == LayoutType.TensorNC4HW4:
            dst_align = 32
        return dst_align

    def filter_epilogue_with_conv_kind(epilogue: EpilogueFunctor, conv_kind: ConvKind) -> bool:
        return conv_kind == ConvKind.Dgrad \
            and epilogue != EpilogueFunctor.BiasAddLinearCombinationClamp

    # loop over all tile descriptions
    for tile in tile_descriptions:
        if filter_tile_with_layout(tile, dst_layout):
            continue
        bias_type, epilogues = get_bias_type_and_epilogues(tile, dst_type)
        flt_align = get_flt_align(tile)
        dst_align = get_dst_align(tile, dst_layout)
        for epilogue in epilogues:
            if filter_epilogue_with_conv_kind(epilogue, conv_kind):
                continue
            if dst_type == DataType.f32:
                bias_type = DataType.f32
            #
            src = TensorDescription(tile.math_instruction.element_b, src_layout,
                                    int(src_align / DataTypeSize[tile.math_instruction.element_b]))
            flt = TensorDescription(tile.math_instruction.element_a, flt_layout,
                                    int(flt_align / DataTypeSize[tile.math_instruction.element_a]))
            bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
            dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))
            new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt,
                                            bias, dst, element_epilogue, epilogue, swizzling_functor,
                                            True, implicit_gemm_mode)
            operations.append(new_operation)
            if not skip_unity_kernel:
                new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src,
                                                flt, bias, dst, element_epilogue, epilogue,
                                                swizzling_functor, False, implicit_gemm_mode)
                operations.append(new_operation)
    return operations
###################################################################################################
#
# Emitter functions for all targets
#
###################################################################################################

class EmitConv2dConfigurationLibrary:
    def __init__(self, operation_path, configuration_name):
        self.configuration_name = configuration_name
        self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name)
        self.instance_emitter = EmitConv2dInstance()

        self.instance_template = """
${operation_instance}

// Derived class
struct ${operation_name} :
    public ${operation_name}_base { };

///////////////////////////////////////////////////////////////////////////////////////////////////
"""
        self.header_template = """
/*
    Generated by conv2d_operation.py - Do not edit.
*/

///////////////////////////////////////////////////////////////////////////////////////////////////

#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"

#include "library_internal.h"
#include "conv2d_operation.h"

///////////////////////////////////////////////////////////////////////////////////////////////////
"""
        self.configuration_header = """
namespace cutlass {
namespace library {

// Initialize all instances
void initialize_${configuration_name}(Manifest &manifest) {
"""
        self.configuration_instance = """
    using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution<
        ${operation_name}>;

    manifest.append(new cutlass::library::Conv2dOperation<
        Operation_${operation_name}>(
            "${operation_name}"));
"""
        self.configuration_epilogue = """
}
"""
        self.epilogue_template = """
///////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace library
} // namespace cutlass

///////////////////////////////////////////////////////////////////////////////////////////////////
"""

    #
    def __enter__(self):
        self.configuration_file = open(self.configuration_path, "w")
        self.configuration_file.write(SubstituteTemplate(self.header_template, {
            'configuration_name': self.configuration_name
        }))
        self.operations = []
        return self

    #
    def emit(self, operation):
        self.operations.append(operation)
        self.configuration_file.write(SubstituteTemplate(self.instance_template, {
            'configuration_name': self.configuration_name,
            'operation_name': operation.procedural_name(),
            'operation_instance': self.instance_emitter.emit(operation),
        }))

    #
    def __exit__(self, exception_type, exception_value, traceback):
        self.configuration_file.write(SubstituteTemplate(self.configuration_header, {
            'configuration_name': self.configuration_name
        }))
        for operation in self.operations:
            self.configuration_file.write(SubstituteTemplate(self.configuration_instance, {
                'configuration_name': self.configuration_name,
                'operation_name': operation.procedural_name(),
            }))
        self.configuration_file.write(self.configuration_epilogue)
        self.configuration_file.write(self.epilogue_template)
        self.configuration_file.close()

###################################################################################################
###################################################################################################
#
# Emitters for Conv Kernel Wrapper
#
###################################################################################################

class EmitConvSingleKernelWrapper:
    def __init__(self, kernel_path, operation, wrapper_path):
        self.kernel_path = kernel_path
        self.wrapper_path = wrapper_path
        self.operation = operation
        self.conv_wrappers = {
            ConvKind.Fprop: """
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
""",
            ConvKind.Dgrad: """
template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper<Deconvolution>(
    const typename Deconvolution::ElementSrc* d_src,
    const typename Deconvolution::ElementFilter* d_filter,
    const typename Deconvolution::ElementBias* d_bias,
    const typename Deconvolution::ElementDst* d_z,
    typename Deconvolution::ElementDst* d_dst,
    int* workspace,
    typename Deconvolution::ConvolutionParameter const& conv_param,
    typename Deconvolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream);
""",
        }
        if self.operation.conv_kind == ConvKind.Fprop:
            self.instance_emitter = EmitConv2dInstance()
        else:
            assert self.operation.conv_kind == ConvKind.Dgrad
            self.instance_emitter = EmitDeconvInstance()

        self.header_template = """
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"

#include "${wrapper_path}"
"""
        self.instance_template = """
${operation_instance}
"""
        self.wrapper_template = """
${wrapper_instance}
"""
        self.epilogue_template = """
#pragma GCC diagnostic pop
#endif
"""

    #
    def __enter__(self):
        self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name())
        self.kernel_file = LazyFile(self.kernel_path)
        self.kernel_file.write(SubstituteTemplate(self.header_template, {
            'wrapper_path': self.wrapper_path,
        }))
        return self

    #
    def emit(self):
        self.kernel_file.write(SubstituteTemplate(self.instance_template, {
            'operation_instance': self.instance_emitter.emit(self.operation),
        }))
        # emit wrapper
        wrapper = SubstituteTemplate(self.wrapper_template, {
            'wrapper_instance': self.conv_wrappers[self.operation.conv_kind],
        })
        self.kernel_file.write(wrapper)

    #
    def __exit__(self, exception_type, exception_value, traceback):
        self.kernel_file.write(self.epilogue_template)
        self.kernel_file.close()

###################################################################################################
###################################################################################################
@@ -0,0 +1,38 @@
from generator import (
    GenerateGemmOperations,
    GenerateGemvOperations,
    GenerateConv2dOperations,
    GenerateDeconvOperations,
)

class GenArg:
    def __init__(self, gen_op, gen_type):
        self.operations = gen_op
        self.type = gen_type

def write_op_list(f, gen_op, gen_type):
    if gen_op == "gemm":
        operations = GenerateGemmOperations(GenArg(gen_op, gen_type))
    elif gen_op == "gemv":
        operations = GenerateGemvOperations(GenArg(gen_op, gen_type))
    elif gen_op == "conv2d":
        operations = GenerateConv2dOperations(GenArg(gen_op, gen_type))
    elif gen_op == "deconv":
        operations = GenerateDeconvOperations(GenArg(gen_op, gen_type))
    for op in operations:
        f.write('    "%s.cu",\n' % op.procedural_name())

if __name__ == "__main__":
    with open("list.bzl", "w") as f:
        f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n")
        f.write("cutlass_gen_list = [\n")
        write_op_list(f, "gemm", "simt")
        write_op_list(f, "gemv", "simt")
        write_op_list(f, "deconv", "simt")
        write_op_list(f, "conv2d", "simt")
        write_op_list(f, "conv2d", "tensorop8816")
        write_op_list(f, "conv2d", "tensorop8832")
        f.write("]")
@@ -0,0 +1,651 @@
#
# \file generator.py
#
# \brief Generates the CUTLASS Library's instances
#

import enum
import os.path
import shutil
import argparse

from library import *
from manifest import *

###################################################################################################

#
def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):
    # by default, use the latest CUDA Toolkit version
    cuda_version = [11, 0, 132]

    # update cuda_version based on the parsed string
    if semantic_ver_string != '':
        for i, x in enumerate([int(x) for x in semantic_ver_string.split('.')]):
            if i < len(cuda_version):
                cuda_version[i] = x
            else:
                cuda_version.append(x)
    return cuda_version >= [major, minor, patch]
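# Illustrative behaviour (not called by the generator below): with the default
# toolkit version [11, 0, 132],
#
#     CudaToolkitVersionSatisfies('', 11, 0)      # True:  [11, 0, 132] >= [11, 0, 0]
#     CudaToolkitVersionSatisfies('10.2', 10, 1)  # True:  [10, 2, 132] >= [10, 1, 0]
#     CudaToolkitVersionSatisfies('9.2', 10, 0)   # False: [9, 2, 132]  <  [10, 0, 0]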
###################################################################################################
###################################################################################################

#
def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type,
                       alignment_constraints, complex_transforms = None,
                       epilogue_functor = EpilogueFunctor.LinearCombination,
                       swizzling_functor = SwizzlingFunctor.Identity8):
    if complex_transforms is None:
        complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]

    element_a, element_b, element_c, element_epilogue = data_type

    operations = []

    # by default, only generate the largest tile and largest alignment
    if manifest.args.kernels == '':
        tile_descriptions = [tile_descriptions[0],]
        alignment_constraints = [alignment_constraints[0],]

    for layout in layouts:
        for tile_description in tile_descriptions:
            for alignment in alignment_constraints:
                for complex_transform in complex_transforms:
                    alignment_c = min(8, alignment)

                    A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
                    B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
                    C = TensorDescription(element_c, layout[2], alignment_c)

                    new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability,
                                                  tile_description, A, B, C, element_epilogue,
                                                  epilogue_functor, swizzling_functor)

                    manifest.append(new_operation)
                    operations.append(new_operation)

    return operations

###########################################################################################################
#   ConvolutionOperator support variations
#   ____________________________________________________________________
#    ConvolutionOperator |      Analytic       |    Optimized
#   ____________________________________________________________________
#   |       Fprop        |     (strided)       |    (strided)
#   |       Dgrad        |  (strided, unity*)  |     (unity)
#   |       Wgrad        |     (strided)       |    (strided)
#   ____________________________________________________________________
#
# Note: operators marked (*) are supported but not generated, to keep the instantiated kernel count low
###########################################################################################################

# Convolution for 2D operations
def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment,
                         conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad],
                         epilogue_functor = EpilogueFunctor.LinearCombination):
    element_a, element_b, element_c, element_epilogue = data_type

    # one exceptional case
    alignment_c = min(8, alignment)

    # iterator algorithms (analytic and optimized)
    iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]

    # by default, only generate the largest tile size
    if manifest.args.kernels == '':
        tile_descriptions = [tile_descriptions[0],]

    operations = []

    for tile in tile_descriptions:
        for conv_kind in conv_kinds:
            for iterator_algorithm in iterator_algorithms:
                A = TensorDescription(element_a, layout[0], alignment)
                B = TensorDescription(element_b, layout[1], alignment)
                C = TensorDescription(element_c, layout[2], alignment_c)

                # unity stride only for Optimized Dgrad
                if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad):
                    new_operation = Conv2dOperation(conv_kind, iterator_algorithm,
                                                    tile.minimum_compute_capability, tile,
                                                    A, B, C, element_epilogue,
                                                    StrideSupport.Unity, epilogue_functor)
                    manifest.append(new_operation)
                    operations.append(new_operation)

                # strided dgrad is not supported by Optimized Dgrad
                if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad):
                    continue

                # strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic)
                new_operation = Conv2dOperation(conv_kind, iterator_algorithm,
                                                tile.minimum_compute_capability, tile,
                                                A, B, C, element_epilogue,
                                                StrideSupport.Strided, epilogue_functor)
                manifest.append(new_operation)
                operations.append(new_operation)

    return operations
###################################################################################################
###################################################################################################

def GenerateConv2d_Simt(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC4HW4, LayoutType.TensorC4RSK4),
    ]

    math_instructions = [
        MathInstruction(
            [1, 1, 4],
            DataType.s8, DataType.s8, DataType.s32,
            OpcodeClass.Simt,
            MathOperation.multiply_add),
    ]

    dst_layouts = [
        LayoutType.TensorNC4HW4,
        LayoutType.TensorNC32HW32,
        LayoutType.TensorNHWC,
        LayoutType.TensorNHWC,
        LayoutType.TensorNCHW,
    ]

    dst_types = [
        DataType.s8,
        DataType.s8,
        DataType.u4,
        DataType.s4,
        DataType.f32,
    ]

    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in zip(dst_types, dst_layouts):
                if dst_type == DataType.s4 or dst_type == DataType.u4:
                    min_cc = 75
                    skip_unity_kernel = True
                else:
                    min_cc = 61
                    skip_unity_kernel = False

                tile_descriptions = [
                    TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                    TileDescription([128,  64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 64,  64, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([128,  32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 32,  64, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 64,  32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 32,  32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 16,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                ]

                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 32, 32, 32,
                                             skip_unity_kernel)
    return operations
def GenerateConv2d_TensorOp_8816(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32),
    ]

    math_instructions = [
        MathInstruction(
            [8, 8, 16],
            DataType.s8, DataType.s8, DataType.s32,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add_saturate),
    ]

    dst_layouts = [
        LayoutType.TensorNC32HW32,
        LayoutType.TensorNC4HW4,
    ]

    dst_types = [
        DataType.s8,
        DataType.s8,
    ]

    min_cc = 75
    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in zip(dst_types, dst_layouts):
                if dst_layout == LayoutType.TensorNC32HW32:
                    tile_descriptions = [
                        TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 32,  64, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                    ]
                else:
                    assert dst_layout == LayoutType.TensorNC4HW4
                    tile_descriptions = [
                        TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 32,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                    ]

                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 128, 128, 64,
                                             False)
    return operations
def GenerateConv2d_TensorOp_8832(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64),
    ]

    math_instructions = [
        MathInstruction(
            [8, 8, 32],
            DataType.s4, DataType.s4, DataType.s32,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add_saturate),
        MathInstruction(
            [8, 8, 32],
            DataType.s4, DataType.u4, DataType.s32,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add_saturate),
    ]

    dst_layouts = [
        LayoutType.TensorNC64HW64,
    ]

    min_cc = 75
    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_layout in dst_layouts:
                dst_type = math_inst.element_b
                tile_descriptions = [
                    TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 128, 128, 64,
                                             True)

    layouts_nhwc = [
        (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),
        (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 64),
        (LayoutType.TensorNHWC, LayoutType.TensorNC32HW32, 128),
    ]

    dst_layouts_nhwc = [
        LayoutType.TensorNHWC,
    ]

    for math_inst in math_instructions:
        for layout in layouts_nhwc:
            for dst_layout in dst_layouts_nhwc:
                dst_type = math_inst.element_b
                tile_descriptions = [
                    TileDescription([128, 32, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([128, 64, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, layout[2], layout[2], 32,
                                             False, ImplicitGemmMode.GemmTn)
    return operations
def GenerateDeconv_Simt(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC4HW4, LayoutType.TensorK4RSC4),
    ]

    math_instructions = [
        MathInstruction(
            [1, 1, 4],
            DataType.s8, DataType.s8, DataType.s32,
            OpcodeClass.Simt,
            MathOperation.multiply_add),
    ]

    dst_layouts = [
        LayoutType.TensorNC4HW4,
    ]

    dst_types = [
        DataType.s8,
    ]

    min_cc = 61
    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in zip(dst_types, dst_layouts):
                tile_descriptions = [
                    TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                    TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([16,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 32, 32, 32,
                                             True)
    return operations
################################################################################
# parameters
#   Edge    - for tiles, the edges represent the length of one side
#   Ratio   - the maximum ratio between 2 edges, limits the skinniness of tiles
#   MaxEdge - maximum length of each edge
#   Min/Max - minimum/maximum of the product of edge lengths
################################################################################

warpsPerThreadblockEdge = [1, 2, 4, 8, 16]
warpsPerThreadblockRatio = 2
warpsPerThreadblockMax = 16
# NOTE: 1x32 and 2x16 warp tile shapes fail validation for ~10% of cases
warpShapeEdges = [8, 16, 32, 64, 128, 256]
warpShapeRatio = 4
warpShapeMax = 64 * 64
warpShapeMin = 8 * 8
threadblockEdgeMax = 256

#                   char    type               bits/elem   max tile     L0 threadblock tiles
precisions = {
    "c": ["cutlass::complex<float>",    64,  64 * 128, [[ 64, 128], [ 64, 32]]],
    "d": ["double",                     64,  64 *  64, [[ 64,  64], [ 32, 32]]],
    "h": ["cutlass::half_t",            16, 128 * 256, [[256, 128], [ 64, 128], [64, 32]]],
    "i": ["int",                        32, 128 * 128, [[128,  64], [ 16, 32]]],
    "s": ["float",                      32, 128 * 128, [[128, 256], [128, 128], [64, 64]]],
    "z": ["cutlass::complex<double>",  128,  64 *  64, [[ 32,  64], [ 16, 32]]],
}

# L1 will have a single kernel for every unique shape
# L2 will have everything else
def GenerateGemm_Simt(args):
    ################################################################################
    # warps per threadblock
    ################################################################################
    warpsPerThreadblocks = []
    for warpsPerThreadblock0 in warpsPerThreadblockEdge:
        for warpsPerThreadblock1 in warpsPerThreadblockEdge:
            if warpsPerThreadblock0 / warpsPerThreadblock1 <= warpsPerThreadblockRatio \
                    and warpsPerThreadblock1 / warpsPerThreadblock0 <= warpsPerThreadblockRatio \
                    and warpsPerThreadblock0 * warpsPerThreadblock1 <= warpsPerThreadblockMax:
                warpsPerThreadblocks.append([warpsPerThreadblock0, warpsPerThreadblock1])

    ################################################################################
    # warp shapes
    ################################################################################
    warpNumThreads = 32
    warpShapes = []
    for warp0 in warpShapeEdges:
        for warp1 in warpShapeEdges:
            if warp0 / warp1 <= warpShapeRatio \
                    and warp1 / warp0 <= warpShapeRatio \
                    and warp0 * warp1 <= warpShapeMax \
                    and warp0 * warp1 > warpShapeMin:
                warpShapes.append([warp0, warp1])

    # sgemm
    precisionType, precisionBits, threadblockMaxElements, threadblockTilesL0 = precisions["s"]

    layouts = [
        (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),  # nn
        (LayoutType.ColumnMajor, LayoutType.RowMajor,    LayoutType.RowMajor),  # nt
        (LayoutType.RowMajor,    LayoutType.ColumnMajor, LayoutType.RowMajor),  # tn
        (LayoutType.RowMajor,    LayoutType.RowMajor,    LayoutType.RowMajor),  # tt
    ]

    math_instructions = [
        MathInstruction(
            [1, 1, 1],
            DataType.f32, DataType.f32, DataType.f32,
            OpcodeClass.Simt,
            MathOperation.multiply_add),
    ]

    min_cc = 50
    max_cc = 1024

    operations = []
    for math_inst in math_instructions:
        for layout in layouts:
            data_type = [
                math_inst.element_a,
                math_inst.element_b,
                math_inst.element_accumulator,
                math_inst.element_accumulator,
            ]
            tile_descriptions = [
                TileDescription([ 64, 256, 8], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                TileDescription([256,  64, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32, 256, 8], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                TileDescription([256,  32, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([128,  64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([128,  32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([ 64,  64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32,  64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 64,  32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32,  32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([  8,  32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 16,  32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 16,  64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 16, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
            ]
            for warpsPerThreadblock in warpsPerThreadblocks:
                for warpShape in warpShapes:
                    if warpShape[0] > warpShape[1]:
                        warpThreadsM = 8
                    else:
                        warpThreadsM = 4
                    warpThreadsN = warpNumThreads // warpThreadsM

                    # skip shapes with conflicting rectangularity;
                    # they are unlikely to be fastest
                    blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1]
                    blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1]
                    warpG = warpShape[0] > warpShape[1]
                    warpL = warpShape[0] < warpShape[1]

                    blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1] * 2
                    blockL2 = warpsPerThreadblock[0] * 2 < warpsPerThreadblock[1]
                    warpG2 = warpShape[0] > warpShape[1] * 2
                    warpL2 = warpShape[0] * 2 < warpShape[1]

                    if blockG2 and warpL: continue
                    if blockL2 and warpG: continue
                    if warpG2 and blockL: continue
                    if warpL2 and blockG: continue

                    # check threadblock ratios and max
                    threadblockTile = [warpShape[0] * warpsPerThreadblock[0],
                                       warpShape[1] * warpsPerThreadblock[1]]
                    if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements: continue
                    if threadblockTile[0] > threadblockEdgeMax: continue
                    if threadblockTile[1] > threadblockEdgeMax: continue
                    totalThreads = warpNumThreads * warpsPerThreadblock[0] * warpsPerThreadblock[1]

                    # calculate unroll
                    # ensure that every iteration does at least a full load of A and B
                    unrollMin = 8
                    unrollMin0 = totalThreads // threadblockTile[0]
                    unrollMin1 = totalThreads // threadblockTile[1]
                    unroll = max(unrollMin, unrollMin0, unrollMin1)

                    threadTileM = warpShape[0] // warpThreadsM
                    threadTileN = warpShape[1] // warpThreadsN
                    if threadTileM < 2 or threadTileN < 2: continue
                    if threadTileM * threadTileN * precisionBits > 8 * 8 * 32: continue

                    # epilogue currently only supports N < WarpNumThreads
                    if threadblockTile[1] < warpNumThreads: continue

                    # limit smem
                    smemBitsA = threadblockTile[0] * unroll * 2 * precisionBits
                    smemBitsB = threadblockTile[1] * unroll * 2 * precisionBits
                    smemKBytes = (smemBitsA + smemBitsB) / 8 / 1024
                    if smemKBytes > 48: continue
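                    # e.g. a 128x128 threadblock tile with unroll = 8 in fp32
                    # (precisionBits = 32):
                    #     smemBitsA  = 128 * 8 * 2 * 32 = 65536
                    #     smemBitsB  = 128 * 8 * 2 * 32 = 65536
                    #     smemKBytes = (65536 + 65536) / 8 / 1024 = 16 KB <= 48 KB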
                    tile = TileDescription([threadblockTile[0], threadblockTile[1], unroll],
                                           2,
                                           [threadblockTile[0] // warpShape[0], threadblockTile[1] // warpShape[1], 1],
                                           math_inst, min_cc, max_cc)

                    def filter(t: TileDescription) -> bool:
                        nonlocal tile
                        return t.threadblock_shape[0] == tile.threadblock_shape[0] and \
                            t.threadblock_shape[1] == tile.threadblock_shape[1] and \
                            t.threadblock_shape[2] == tile.threadblock_shape[2] and \
                            t.warp_count[0] == tile.warp_count[0] and \
                            t.warp_count[1] == tile.warp_count[1] and \
                            t.warp_count[2] == tile.warp_count[2] and \
                            t.stages == tile.stages

                    if not any(t for t in tile_descriptions if filter(t)): continue

                    operations += GeneratesGemm(tile, data_type, layout[0], layout[1], layout[2], min_cc)
    return operations
#
def GenerateGemv_Simt(args):
    threadBlockShape_N = [128, 64, 32]
    ldgBits_A = [128, 64, 32]
    ldgBits_B = [128, 64, 32]

    layouts = [
        (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),
    ]

    math_instructions = [
        MathInstruction(
            [1, 1, 1],
            DataType.f32, DataType.f32, DataType.f32,
            OpcodeClass.Simt,
            MathOperation.multiply_add),
    ]

    min_cc = 50

    operations = []
    for math_inst in math_instructions:
        for layout in layouts:
            data_type = [
                math_inst.element_a,
                math_inst.element_b,
                math_inst.element_accumulator,
                math_inst.element_accumulator,
            ]
            for threadblock_shape_n in threadBlockShape_N:
                for align_a in ldgBits_A:
                    for align_b in ldgBits_B:
                        ldg_elements_a = align_a // DataTypeSize[math_inst.element_a]
                        ldg_elements_b = align_b // DataTypeSize[math_inst.element_b]
                        threadblock_shape_k = (256 * ldg_elements_a) // (threadblock_shape_n // ldg_elements_b)
                        threadblock_shape = [1, threadblock_shape_n, threadblock_shape_k]
                        thread_shape = [1, ldg_elements_b, ldg_elements_a]
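                        # e.g. fp32 (32 bits) with align_a = align_b = 128 and
                        # threadblock_shape_n = 128:
                        #     ldg_elements_a = 128 // 32 = 4
                        #     ldg_elements_b = 128 // 32 = 4
                        #     threadblock_shape_k = (256 * 4) // (128 // 4) = 32
                        #     threadblock_shape = [1, 128, 32], thread_shape = [1, 4, 4]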
                        operations.append(GeneratesGemv(math_inst,
                                                        threadblock_shape,
                                                        thread_shape,
                                                        data_type,
                                                        layout[0],
                                                        layout[1],
                                                        layout[2],
                                                        min_cc,
                                                        align_a,
                                                        align_b))
    return operations
#
def GenerateConv2dOperations(args):
    if args.type == "simt":
        return GenerateConv2d_Simt(args)
    elif args.type == "tensorop8816":
        return GenerateConv2d_TensorOp_8816(args)
    else:
        assert args.type == "tensorop8832", "operation conv2d only supports " \
            "simt, tensorop8816 and tensorop8832 (got: {})".format(args.type)
        return GenerateConv2d_TensorOp_8832(args)

def GenerateDeconvOperations(args):
    assert args.type == "simt", "operation deconv only supports " \
        "simt (got: {})".format(args.type)
    return GenerateDeconv_Simt(args)

def GenerateGemmOperations(args):
    assert args.type == "simt", "operation gemm only supports " \
        "simt (got: {})".format(args.type)
    return GenerateGemm_Simt(args)

def GenerateGemvOperations(args):
    assert args.type == "simt", "operation gemv only supports " \
        "simt (got: {})".format(args.type)
    return GenerateGemv_Simt(args)
###################################################################################################
###################################################################################################

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generates device kernel registration code for CUTLASS Kernels")
    parser.add_argument("--operations", type=str, choices=['gemm', 'gemv', 'conv2d', 'deconv'],
                        required=True, help="Specifies the operation to generate (gemm, gemv, conv2d, deconv)")
    parser.add_argument("output", type=str, help="output directory for CUTLASS kernel files")
    parser.add_argument("--type", type=str, choices=['simt', 'tensorop8816', 'tensorop8832'],
                        default='simt', help="kernel type of CUTLASS kernel generator")

    operation2wrapper_path = {
        "gemm": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl",
        "gemv": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl",
        "conv2d": "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl",
        "deconv": "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl",
    }

    args = parser.parse_args()
    wrapper_path = operation2wrapper_path[args.operations]

    if args.operations == "gemm":
        operations = GenerateGemmOperations(args)
    elif args.operations == "gemv":
        operations = GenerateGemvOperations(args)
    elif args.operations == "conv2d":
        operations = GenerateConv2dOperations(args)
    elif args.operations == "deconv":
        operations = GenerateDeconvOperations(args)

    if args.operations == "conv2d" or args.operations == "deconv":
        for operation in operations:
            with EmitConvSingleKernelWrapper(args.output, operation, wrapper_path) as emitter:
                emitter.emit()
    elif args.operations == "gemm" or args.operations == "gemv":
        for operation in operations:
            with EmitGemmSingleKernelWrapper(args.output, operation, wrapper_path) as emitter:
                emitter.emit()

#
###################################################################################################
@@ -0,0 +1,27 @@
#
# \file lazy_file.py
#
# \brief LazyFile updates the target file only when the content has changed,
# in order to avoid regenerating the cutlass kimpls every time cmake is called
#

import io
import os

class LazyFile:
    def __init__(self, filename):
        self.filename = filename
        self.buffer = io.StringIO()

    def write(self, data):
        self.buffer.write(str(data))

    def close(self):
        if os.path.isfile(self.filename):
            old_data = open(self.filename).read()
        else:
            old_data = ""
        new_data = self.buffer.getvalue()
        if old_data != new_data:
            with open(self.filename, "w") as f:
                f.write(new_data)
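A minimal usage sketch (the file name is hypothetical): because `close()` compares the buffered content against what is already on disk, re-running the generator with unchanged output leaves the file's timestamp alone and avoids needless recompilation:

```python
f = LazyFile("example_kernel.cu")   # hypothetical output path
f.write("// generated content\n")
f.close()                           # writes only if the content differs
```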
@@ -0,0 +1,614 @@ | |||||
# | |||||
# \file generator.py | |||||
# | |||||
# \brief Generates the CUTLASS Library's instances | |||||
# | |||||
import re | |||||
################################################################################################### | |||||
import enum | |||||
# The following block implements enum.auto() for Python 3.5 variants that don't include it such | |||||
# as the default 3.5.2 on Ubuntu 16.04. | |||||
# | |||||
# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility | |||||
try: | |||||
from enum import auto as enum_auto | |||||
except ImportError: | |||||
__cutlass_library_auto_enum = 0 | |||||
def enum_auto() -> int: | |||||
global __cutlass_library_auto_enum | |||||
i = __cutlass_library_auto_enum | |||||
__cutlass_library_auto_enum += 1 | |||||
return i | |||||
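# Illustrative self-check (assumption: run as a script, not executed on import): whichever
# branch above provided enum_auto, successive calls yield distinct member values, which is
# all the Enum definitions below rely on.
if __name__ == "__main__":
    class _AutoDemo(enum.Enum):
        first = enum_auto()
        second = enum_auto()
    assert _AutoDemo.first is not _AutoDemo.second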
################################################################################################### | |||||
# | |||||
class GeneratorTarget(enum.Enum): | |||||
Library = enum_auto() | |||||
# | |||||
GeneratorTargetNames = { | |||||
GeneratorTarget.Library: 'library' | |||||
} | |||||
# | |||||
################################################################################################### | |||||
# | |||||
class DataType(enum.Enum): | |||||
b1 = enum_auto() | |||||
u4 = enum_auto() | |||||
u8 = enum_auto() | |||||
u16 = enum_auto() | |||||
u32 = enum_auto() | |||||
u64 = enum_auto() | |||||
s4 = enum_auto() | |||||
s8 = enum_auto() | |||||
s16 = enum_auto() | |||||
s32 = enum_auto() | |||||
s64 = enum_auto() | |||||
f16 = enum_auto() | |||||
bf16 = enum_auto() | |||||
f32 = enum_auto() | |||||
tf32 = enum_auto() | |||||
f64 = enum_auto() | |||||
cf16 = enum_auto() | |||||
cbf16 = enum_auto() | |||||
cf32 = enum_auto() | |||||
ctf32 = enum_auto() | |||||
cf64 = enum_auto() | |||||
cs4 = enum_auto() | |||||
cs8 = enum_auto() | |||||
cs16 = enum_auto() | |||||
cs32 = enum_auto() | |||||
cs64 = enum_auto() | |||||
cu4 = enum_auto() | |||||
cu8 = enum_auto() | |||||
cu16 = enum_auto() | |||||
cu32 = enum_auto() | |||||
cu64 = enum_auto() | |||||
invalid = enum_auto() | |||||
# | |||||
ShortDataTypeNames = { | |||||
DataType.s32: 'i', | |||||
DataType.f16: 'h', | |||||
DataType.f32: 's', | |||||
DataType.f64: 'd', | |||||
DataType.cf32: 'c', | |||||
DataType.cf64: 'z', | |||||
} | |||||
# | |||||
DataTypeNames = { | |||||
DataType.b1: "b1", | |||||
DataType.u4: "u4", | |||||
DataType.u8: "u8", | |||||
DataType.u16: "u16", | |||||
DataType.u32: "u32", | |||||
DataType.u64: "u64", | |||||
DataType.s4: "s4", | |||||
DataType.s8: "s8", | |||||
DataType.s16: "s16", | |||||
DataType.s32: "s32", | |||||
DataType.s64: "s64", | |||||
DataType.f16: "f16", | |||||
DataType.bf16: "bf16", | |||||
DataType.f32: "f32", | |||||
DataType.tf32: "tf32", | |||||
DataType.f64: "f64", | |||||
DataType.cf16: "cf16", | |||||
DataType.cbf16: "cbf16", | |||||
DataType.cf32: "cf32", | |||||
DataType.ctf32: "ctf32", | |||||
DataType.cf64: "cf64", | |||||
DataType.cu4: "cu4", | |||||
DataType.cu8: "cu8", | |||||
DataType.cu16: "cu16", | |||||
DataType.cu32: "cu32", | |||||
DataType.cu64: "cu64", | |||||
DataType.cs4: "cs4", | |||||
DataType.cs8: "cs8", | |||||
DataType.cs16: "cs16", | |||||
DataType.cs32: "cs32", | |||||
DataType.cs64: "cs64", | |||||
} | |||||
DataTypeTag = { | |||||
DataType.b1: "cutlass::uint1b_t", | |||||
DataType.u4: "cutlass::uint4b_t", | |||||
DataType.u8: "uint8_t", | |||||
DataType.u16: "uint16_t", | |||||
DataType.u32: "uint32_t", | |||||
DataType.u64: "uint64_t", | |||||
DataType.s4: "cutlass::int4b_t", | |||||
DataType.s8: "int8_t", | |||||
DataType.s16: "int16_t", | |||||
DataType.s32: "int32_t", | |||||
DataType.s64: "int64_t", | |||||
DataType.f16: "cutlass::half_t", | |||||
DataType.bf16: "cutlass::bfloat16_t", | |||||
DataType.f32: "float", | |||||
DataType.tf32: "cutlass::tfloat32_t", | |||||
DataType.f64: "double", | |||||
DataType.cf16: "cutlass::complex<cutlass::half_t>", | |||||
DataType.cbf16: "cutlass::complex<cutlass::bfloat16_t>", | |||||
DataType.cf32: "cutlass::complex<float>", | |||||
DataType.ctf32: "cutlass::complex<cutlass::tfloat32_t>", | |||||
DataType.cf64: "cutlass::complex<double>", | |||||
DataType.cu4: "cutlass::complex<cutlass::uint4b_t>", | |||||
DataType.cu8: "cutlass::complex<cutlass::uint8_t>", | |||||
DataType.cu16: "cutlass::complex<cutlass::uint16_t>", | |||||
DataType.cu32: "cutlass::complex<cutlass::uint32_t>", | |||||
DataType.cu64: "cutlass::complex<cutlass::uint64_t>", | |||||
DataType.cs4: "cutlass::complex<cutlass::int4b_t>", | |||||
DataType.cs8: "cutlass::complex<cutlass::int8_t>", | |||||
DataType.cs16: "cutlass::complex<cutlass::int16_t>", | |||||
DataType.cs32: "cutlass::complex<cutlass::int32_t>", | |||||
DataType.cs64: "cutlass::complex<cutlass::int64_t>", | |||||
} | |||||
DataTypeSize = { | |||||
DataType.b1: 1, | |||||
DataType.u4: 4, | |||||
  DataType.u8: 8,
DataType.u16: 16, | |||||
DataType.u32: 32, | |||||
DataType.u64: 64, | |||||
DataType.s4: 4, | |||||
DataType.s8: 8, | |||||
DataType.s16: 16, | |||||
DataType.s32: 32, | |||||
DataType.s64: 64, | |||||
DataType.f16: 16, | |||||
DataType.bf16: 16, | |||||
DataType.f32: 32, | |||||
DataType.tf32: 32, | |||||
DataType.f64: 64, | |||||
DataType.cf16: 32, | |||||
DataType.cbf16: 32, | |||||
DataType.cf32: 64, | |||||
  DataType.ctf32: 64,
DataType.cf64: 128, | |||||
DataType.cu4: 8, | |||||
DataType.cu8: 16, | |||||
DataType.cu16: 32, | |||||
DataType.cu32: 64, | |||||
DataType.cu64: 128, | |||||
DataType.cs4: 8, | |||||
DataType.cs8: 16, | |||||
DataType.cs16: 32, | |||||
DataType.cs32: 64, | |||||
DataType.cs64: 128, | |||||
} | |||||
################################################################################################### | |||||
# | |||||
class ComplexTransform(enum.Enum): | |||||
none = enum_auto() | |||||
conj = enum_auto() | |||||
# | |||||
ComplexTransformTag = { | |||||
ComplexTransform.none: 'cutlass::ComplexTransform::kNone', | |||||
ComplexTransform.conj: 'cutlass::ComplexTransform::kConjugate', | |||||
} | |||||
# | |||||
RealComplexBijection = [ | |||||
(DataType.f16, DataType.cf16), | |||||
(DataType.f32, DataType.cf32), | |||||
(DataType.f64, DataType.cf64), | |||||
] | |||||
# | |||||
def is_complex(data_type): | |||||
for r, c in RealComplexBijection: | |||||
if data_type == c: | |||||
return True | |||||
return False | |||||
# | |||||
def get_complex_from_real(real_type): | |||||
for r, c in RealComplexBijection: | |||||
if real_type == r: | |||||
return c | |||||
return DataType.invalid | |||||
# | |||||
def get_real_from_complex(complex_type): | |||||
for r, c in RealComplexBijection: | |||||
if complex_type == c: | |||||
return r | |||||
return DataType.invalid | |||||
# | |||||
class ComplexMultiplyOp(enum.Enum): | |||||
multiply_add = enum_auto() | |||||
gaussian = enum_auto() | |||||
################################################################################################### | |||||
# | |||||
class MathOperation(enum.Enum): | |||||
multiply_add = enum_auto() | |||||
multiply_add_saturate = enum_auto() | |||||
xor_popc = enum_auto() | |||||
multiply_add_fast_bf16 = enum_auto() | |||||
multiply_add_fast_f16 = enum_auto() | |||||
multiply_add_complex = enum_auto() | |||||
multiply_add_complex_gaussian = enum_auto() | |||||
# | |||||
MathOperationTag = { | |||||
MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd', | |||||
MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate', | |||||
MathOperation.xor_popc: 'cutlass::arch::OpXorPopc', | |||||
MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16', | |||||
MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16', | |||||
MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex', | |||||
MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex', | |||||
} | |||||
################################################################################################### | |||||
# | |||||
class LayoutType(enum.Enum): | |||||
ColumnMajor = enum_auto() | |||||
RowMajor = enum_auto() | |||||
ColumnMajorInterleaved2 = enum_auto() | |||||
RowMajorInterleaved2 = enum_auto() | |||||
ColumnMajorInterleaved32 = enum_auto() | |||||
RowMajorInterleaved32 = enum_auto() | |||||
ColumnMajorInterleaved64 = enum_auto() | |||||
RowMajorInterleaved64 = enum_auto() | |||||
TensorNHWC = enum_auto() | |||||
TensorNDHWC = enum_auto() | |||||
TensorNCHW = enum_auto() | |||||
TensorNGHWC = enum_auto() | |||||
TensorNC4HW4 = enum_auto() | |||||
TensorC4RSK4 = enum_auto() | |||||
TensorNC8HW8 = enum_auto() | |||||
TensorNC16HW16 = enum_auto() | |||||
TensorNC32HW32 = enum_auto() | |||||
TensorNC64HW64 = enum_auto() | |||||
TensorC32RSK32 = enum_auto() | |||||
TensorC64RSK64 = enum_auto() | |||||
TensorK4RSC4 = enum_auto() | |||||
# | |||||
LayoutTag = { | |||||
LayoutType.ColumnMajor: 'cutlass::layout::ColumnMajor', | |||||
LayoutType.RowMajor: 'cutlass::layout::RowMajor', | |||||
LayoutType.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>', | |||||
LayoutType.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>', | |||||
LayoutType.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>', | |||||
LayoutType.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>', | |||||
LayoutType.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>', | |||||
LayoutType.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>', | |||||
LayoutType.TensorNHWC: 'cutlass::layout::TensorNHWC', | |||||
LayoutType.TensorNDHWC: 'cutlass::layout::TensorNDHWC', | |||||
LayoutType.TensorNCHW: 'cutlass::layout::TensorNCHW', | |||||
LayoutType.TensorNGHWC: 'cutlass::layout::TensorNGHWC', | |||||
LayoutType.TensorNC4HW4: 'cutlass::layout::TensorNCxHWx<4>', | |||||
LayoutType.TensorC4RSK4: 'cutlass::layout::TensorCxRSKx<4>', | |||||
LayoutType.TensorNC8HW8: 'cutlass::layout::TensorNCxHWx<8>', | |||||
LayoutType.TensorNC16HW16: 'cutlass::layout::TensorNCxHWx<16>', | |||||
LayoutType.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>', | |||||
LayoutType.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>', | |||||
LayoutType.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>', | |||||
LayoutType.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>', | |||||
LayoutType.TensorK4RSC4: 'cutlass::layout::TensorKxRSCx<4>', | |||||
} | |||||
# | |||||
TransposedLayout = { | |||||
LayoutType.ColumnMajor: LayoutType.RowMajor, | |||||
LayoutType.RowMajor: LayoutType.ColumnMajor, | |||||
LayoutType.ColumnMajorInterleaved2: LayoutType.RowMajorInterleaved2, | |||||
LayoutType.RowMajorInterleaved2: LayoutType.ColumnMajorInterleaved2, | |||||
LayoutType.ColumnMajorInterleaved32: LayoutType.RowMajorInterleaved32, | |||||
LayoutType.RowMajorInterleaved32: LayoutType.ColumnMajorInterleaved32, | |||||
LayoutType.ColumnMajorInterleaved64: LayoutType.RowMajorInterleaved64, | |||||
LayoutType.RowMajorInterleaved64: LayoutType.ColumnMajorInterleaved64, | |||||
LayoutType.TensorNHWC: LayoutType.TensorNHWC | |||||
} | |||||
# | |||||
ShortLayoutTypeNames = { | |||||
LayoutType.ColumnMajor: 'n', | |||||
  LayoutType.ColumnMajorInterleaved2: 'n2',
LayoutType.ColumnMajorInterleaved32: 'n32', | |||||
LayoutType.ColumnMajorInterleaved64: 'n64', | |||||
LayoutType.RowMajor: 't', | |||||
LayoutType.RowMajorInterleaved2: 't2', | |||||
LayoutType.RowMajorInterleaved32: 't32', | |||||
LayoutType.RowMajorInterleaved64: 't64', | |||||
LayoutType.TensorNHWC: 'nhwc', | |||||
LayoutType.TensorNDHWC: 'ndhwc', | |||||
LayoutType.TensorNCHW: 'nchw', | |||||
LayoutType.TensorNGHWC: 'nghwc', | |||||
LayoutType.TensorNC4HW4: 'nc4hw4', | |||||
LayoutType.TensorC4RSK4: 'c4rsk4', | |||||
LayoutType.TensorNC8HW8: 'nc8hw8', | |||||
LayoutType.TensorNC16HW16: 'nc16hw16', | |||||
LayoutType.TensorNC32HW32: 'nc32hw32', | |||||
LayoutType.TensorNC64HW64: 'nc64hw64', | |||||
LayoutType.TensorC32RSK32: 'c32rsk32', | |||||
LayoutType.TensorC64RSK64: 'c64rsk64', | |||||
LayoutType.TensorK4RSC4: 'k4rsc4', | |||||
} | |||||
# | |||||
ShortComplexLayoutNames = { | |||||
(LayoutType.ColumnMajor, ComplexTransform.none): 'n', | |||||
(LayoutType.ColumnMajor, ComplexTransform.conj): 'c', | |||||
(LayoutType.RowMajor, ComplexTransform.none): 't', | |||||
(LayoutType.RowMajor, ComplexTransform.conj): 'h' | |||||
} | |||||
################################################################################################### | |||||
# | |||||
class OpcodeClass(enum.Enum): | |||||
Simt = enum_auto() | |||||
TensorOp = enum_auto() | |||||
WmmaTensorOp = enum_auto() | |||||
OpcodeClassNames = { | |||||
OpcodeClass.Simt: 'simt', | |||||
OpcodeClass.TensorOp: 'tensorop', | |||||
OpcodeClass.WmmaTensorOp: 'wmma_tensorop', | |||||
} | |||||
OpcodeClassTag = { | |||||
OpcodeClass.Simt: 'cutlass::arch::OpClassSimt', | |||||
OpcodeClass.TensorOp: 'cutlass::arch::OpClassTensorOp', | |||||
OpcodeClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp', | |||||
} | |||||
################################################################################################### | |||||
# | |||||
class OperationKind(enum.Enum): | |||||
Gemm = enum_auto() | |||||
Conv2d = enum_auto() | |||||
# | |||||
OperationKindNames = {
  OperationKind.Gemm: 'gemm',
  OperationKind.Conv2d: 'conv2d',
}
# | |||||
class Target(enum.Enum): | |||||
library = enum_auto() | |||||
ArchitectureNames = { | |||||
50: 'maxwell', | |||||
60: 'pascal', | |||||
61: 'pascal', | |||||
70: 'volta', | |||||
75: 'turing', | |||||
80: 'ampere', | |||||
} | |||||
################################################################################################### | |||||
# | |||||
def SubstituteTemplate(template, values): | |||||
text = template | |||||
changed = True | |||||
while changed: | |||||
changed = False | |||||
for key, value in values.items(): | |||||
regex = "\\$\\{%s\\}" % key | |||||
newtext = re.sub(regex, value, text) | |||||
if newtext != text: | |||||
changed = True | |||||
text = newtext | |||||
return text | |||||
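# Minimal sketch of the fixed-point behavior above (illustrative values, run as a script):
# because substitution repeats until no ${...} placeholder changes the text, keys introduced
# by earlier substitutions (here ${layout} inside the value of ${op}) are also resolved.
if __name__ == "__main__":
    _demo = SubstituteTemplate(
        "cutlass_${opcode}_${op}",
        {"opcode": "simt", "op": "sgemm_${layout}", "layout": "nn"},
    )
    assert _demo == "cutlass_simt_sgemm_nn"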
################################################################################################### | |||||
# | |||||
class GemmKind(enum.Enum): | |||||
Gemm = enum_auto() | |||||
Sparse = enum_auto() | |||||
Universal = enum_auto() | |||||
PlanarComplex = enum_auto() | |||||
PlanarComplexArray = enum_auto() | |||||
SplitKParallel = enum_auto() | |||||
GemvBatchedStrided = enum_auto() | |||||
# | |||||
GemmKindNames = { | |||||
GemmKind.Gemm: "gemm", | |||||
GemmKind.Sparse: "spgemm", | |||||
GemmKind.Universal: "gemm", | |||||
GemmKind.PlanarComplex: "gemm_planar_complex", | |||||
GemmKind.PlanarComplexArray: "gemm_planar_complex_array", | |||||
GemmKind.SplitKParallel: "gemm_split_k_parallel", | |||||
GemmKind.GemvBatchedStrided: "gemv_batched_strided", | |||||
} | |||||
# | |||||
class EpilogueFunctor(enum.Enum): | |||||
LinearCombination = enum_auto() | |||||
LinearCombinationClamp = enum_auto() | |||||
BiasAddLinearCombination = enum_auto() | |||||
BiasAddLinearCombinationRelu = enum_auto() | |||||
BiasAddLinearCombinationHSwish = enum_auto() | |||||
BiasAddLinearCombinationClamp = enum_auto() | |||||
BiasAddLinearCombinationReluClamp = enum_auto() | |||||
BiasAddLinearCombinationHSwishClamp = enum_auto() | |||||
# | |||||
EpilogueFunctorTag = { | |||||
EpilogueFunctor.LinearCombination: 'cutlass::epilogue::thread::LinearCombination', | |||||
EpilogueFunctor.LinearCombinationClamp: 'cutlass::epilogue::thread::LinearCombinationClamp', | |||||
EpilogueFunctor.BiasAddLinearCombination: 'cutlass::epilogue::thread::BiasAddLinearCombination', | |||||
EpilogueFunctor.BiasAddLinearCombinationRelu: 'cutlass::epilogue::thread::BiasAddLinearCombinationRelu', | |||||
EpilogueFunctor.BiasAddLinearCombinationHSwish: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwish', | |||||
EpilogueFunctor.BiasAddLinearCombinationClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationClamp', | |||||
EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp', | |||||
EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp', | |||||
} | |||||
# | |||||
ShortEpilogueNames = { | |||||
EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'hswish', | |||||
EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'relu', | |||||
EpilogueFunctor.BiasAddLinearCombinationClamp: 'identity', | |||||
EpilogueFunctor.BiasAddLinearCombinationHSwish: 'hswish', | |||||
EpilogueFunctor.BiasAddLinearCombinationRelu: 'relu', | |||||
EpilogueFunctor.BiasAddLinearCombination: 'identity', | |||||
} | |||||
# | |||||
class SwizzlingFunctor(enum.Enum): | |||||
Identity1 = enum_auto() | |||||
Identity2 = enum_auto() | |||||
Identity4 = enum_auto() | |||||
Identity8 = enum_auto() | |||||
ConvFpropNCxHWx = enum_auto() | |||||
ConvFpropNHWC = enum_auto() | |||||
ConvDgradNCxHWx = enum_auto() | |||||
# | |||||
SwizzlingFunctorTag = { | |||||
SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>', | |||||
SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>', | |||||
SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>', | |||||
SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>', | |||||
SwizzlingFunctor.ConvFpropNCxHWx: 'cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle', | |||||
SwizzlingFunctor.ConvFpropNHWC: 'cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle', | |||||
SwizzlingFunctor.ConvDgradNCxHWx: 'cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle', | |||||
} | |||||
################################################################################################### | |||||
class ConvType(enum.Enum): | |||||
Convolution = enum_auto() | |||||
BatchConvolution = enum_auto() | |||||
Local = enum_auto() | |||||
LocalShare = enum_auto() | |||||
ConvTypeTag = { | |||||
ConvType.Convolution: 'cutlass::conv::ConvType::kConvolution', | |||||
ConvType.BatchConvolution: 'cutlass::conv::ConvType::kBatchConvolution', | |||||
ConvType.Local: 'cutlass::conv::ConvType::kLocal', | |||||
ConvType.LocalShare : 'cutlass::conv::ConvType::kLocalShare', | |||||
} | |||||
# | |||||
class ConvKind(enum.Enum): | |||||
Fprop = enum_auto() | |||||
Dgrad = enum_auto() | |||||
Wgrad = enum_auto() | |||||
# | |||||
ConvKindTag = { | |||||
ConvKind.Fprop: 'cutlass::conv::Operator::kFprop', | |||||
ConvKind.Dgrad: 'cutlass::conv::Operator::kDgrad', | |||||
ConvKind.Wgrad: 'cutlass::conv::Operator::kWgrad' | |||||
} | |||||
ConvKindNames = { | |||||
ConvKind.Fprop: 'fprop', | |||||
ConvKind.Dgrad: 'dgrad', | |||||
ConvKind.Wgrad: 'wgrad', | |||||
} | |||||
# | |||||
class IteratorAlgorithm(enum.Enum): | |||||
Analytic = enum_auto() | |||||
Optimized = enum_auto() | |||||
# | |||||
IteratorAlgorithmTag = { | |||||
IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic', | |||||
IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized', | |||||
} | |||||
IteratorAlgorithmNames = { | |||||
IteratorAlgorithm.Analytic: 'analytic', | |||||
IteratorAlgorithm.Optimized: 'optimized', | |||||
} | |||||
# | |||||
class StrideSupport(enum.Enum): | |||||
Strided = enum_auto() | |||||
Unity = enum_auto() | |||||
# | |||||
StrideSupportTag = { | |||||
StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided', | |||||
StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity', | |||||
} | |||||
StrideSupportNames = { | |||||
StrideSupport.Strided: '', | |||||
StrideSupport.Unity: 'unity_stride', | |||||
} | |||||
class ImplicitGemmMode(enum.Enum): | |||||
GemmNt = enum_auto() | |||||
GemmTn = enum_auto() | |||||
ImplicitGemmModeNames = { | |||||
ImplicitGemmMode.GemmNt: 'gemm_nt', | |||||
ImplicitGemmMode.GemmTn: 'gemm_tn', | |||||
} | |||||
ImplicitGemmModeTag = { | |||||
ImplicitGemmMode.GemmNt: 'cutlass::conv::ImplicitGemmMode::GEMM_NT', | |||||
ImplicitGemmMode.GemmTn: 'cutlass::conv::ImplicitGemmMode::GEMM_TN', | |||||
} | |||||
################################################################################################### | |||||
# | |||||
class MathInstruction: | |||||
def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class, math_operation = MathOperation.multiply_add): | |||||
self.instruction_shape = instruction_shape | |||||
self.element_a = element_a | |||||
self.element_b = element_b | |||||
self.element_accumulator = element_accumulator | |||||
self.opcode_class = opcode_class | |||||
self.math_operation = math_operation | |||||
# | |||||
class TileDescription: | |||||
def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute): | |||||
self.threadblock_shape = threadblock_shape | |||||
self.stages = stages | |||||
self.warp_count = warp_count | |||||
self.math_instruction = math_instruction | |||||
self.minimum_compute_capability = min_compute | |||||
self.maximum_compute_capability = max_compute | |||||
def procedural_name(self): | |||||
return "%dx%d_%dx%d" % (self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], self.stages) | |||||
# | |||||
class TensorDescription: | |||||
def __init__(self, element, layout, alignment = 1, complex_transform = ComplexTransform.none): | |||||
self.element = element | |||||
self.layout = layout | |||||
self.alignment = alignment | |||||
self.complex_transform = complex_transform | |||||
################################################################################################### |
@@ -0,0 +1,578 @@ | |||||
# Generated by dnn/scripts/cutlass_generator/gen_list.py | |||||
cutlass_gen_list = [ | |||||
"cutlass_simt_sgemm_8x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_16x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_16x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_32x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_32x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_64x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_16x128_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_32x128_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_64x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_128x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_64x128_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_128x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_32x256_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_64x256_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_128x128_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_256x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_256x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu", | |||||
"cutlass_simt_sgemm_8x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_16x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_16x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_32x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_32x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_64x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_16x128_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_32x128_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_64x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_128x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_64x128_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_128x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_32x256_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_64x256_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_128x128_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_256x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_256x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu", | |||||
"cutlass_simt_sgemm_8x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_16x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_16x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_32x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_32x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_64x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_16x128_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_32x128_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_64x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_128x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_64x128_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_128x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_32x256_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_64x256_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_128x128_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_256x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_256x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu", | |||||
"cutlass_simt_sgemm_8x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_16x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_16x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_32x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_32x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_64x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_16x128_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_32x128_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_64x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_128x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_64x128_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_128x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_32x256_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_64x256_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_128x128_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_256x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_256x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu", | |||||
"cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu", | |||||
"cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu", | |||||
"cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu", | |||||
"cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu", | |||||
"cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu", | |||||
"cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||||
] |
@@ -0,0 +1,351 @@ | |||||
# | |||||
# \file generator.py | |||||
# | |||||
# \brief Generates the CUTLASS Library's instances | |||||
# | |||||
import enum
import os.path
import re  # used by get_kernel_filters below
import shutil
from library import * | |||||
from gemm_operation import * | |||||
from conv2d_operation import * | |||||
################################################################################################### | |||||
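# Emits "all_<kind>_operations.cu", which declares and invokes the initializer of every emitted configuration for one operation kind.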
class EmitOperationKindLibrary: | |||||
def __init__(self, generated_path, kind, args): | |||||
self.generated_path = generated_path | |||||
self.kind = kind | |||||
self.args = args | |||||
self.emitters = { | |||||
OperationKind.Gemm: EmitGemmConfigurationLibrary | |||||
, OperationKind.Conv2d: EmitConv2dConfigurationLibrary | |||||
} | |||||
self.configurations = []
self.header_template = """
/* | |||||
Generated by manifest.py - Do not edit. | |||||
*/ | |||||
#include "cutlass/cutlass.h" | |||||
#include "cutlass/library/library.h" | |||||
#include "cutlass/library/manifest.h" | |||||
namespace cutlass { | |||||
namespace library { | |||||
/////////////////////////////////////////////////////////////////////////////////////////////////// | |||||
""" | |||||
self.entry_template = """ | |||||
// | |||||
// Entry point to construct operations | |||||
// | |||||
void initialize_all_${operation_name}_operations(Manifest &manifest) { | |||||
""" | |||||
self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n" | |||||
self.configuration_template = "  initialize_${configuration_name}(manifest);\n"
self.epilogue_template = """
} | |||||
/////////////////////////////////////////////////////////////////////////////////////////////////// | |||||
} // namespace library | |||||
} // namespace cutlass | |||||
""" | |||||
# | |||||
def __enter__(self): | |||||
self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind]) | |||||
os.mkdir(self.operation_path) | |||||
self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind]) | |||||
self.top_level_file = open(self.top_level_path, "w") | |||||
self.top_level_file.write(self.header_template) | |||||
self.source_files = [self.top_level_path,] | |||||
return self | |||||
# | |||||
def emit(self, configuration_name, operations): | |||||
with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter: | |||||
for operation in operations: | |||||
configuration_emitter.emit(operation) | |||||
self.source_files.append(configuration_emitter.configuration_path) | |||||
self.configurations.append(configuration_name) | |||||
self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name} )) | |||||
# | |||||
def __exit__(self, exception_type, exception_value, traceback): | |||||
self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]})) | |||||
for configuration_name in self.configurations: | |||||
self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name})) | |||||
self.top_level_file.write(self.epilogue_template) | |||||
self.top_level_file.close() | |||||
################################################################################################### | |||||
################################################################################################### | |||||
class Options: | |||||
def __init__(self): | |||||
pass | |||||
################################################################################################### | |||||
# | |||||
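# Filters the generated operations and writes the per-kind sources, the top-level initialize_all.cpp and the manifest.cmake consumed by the build.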
class Manifest: | |||||
# | |||||
def __init__(self, args): | |||||
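# args is expected to provide: architectures, operations, kernels, ignore_kernels, kernel_filter_file and curr_build_dir (all read below or in emit()).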
self.operations = {} | |||||
self.args = args | |||||
architectures = args.architectures.split(';') if len(args.architectures) else ['50',] | |||||
self.compute_capabilities = [int(x) for x in architectures] | |||||
self.selected_kernels = [] | |||||
if args.operations == 'all': | |||||
self.operations_enabled = [] | |||||
else: | |||||
operations_list = [ | |||||
OperationKind.Gemm | |||||
, OperationKind.Conv2d | |||||
] | |||||
self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')] | |||||
if args.kernels == 'all': | |||||
self.kernel_names = [] | |||||
else: | |||||
self.kernel_names = [x for x in args.kernels.split(',') if x != ''] | |||||
self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != ''] | |||||
if args.kernel_filter_file is None: | |||||
self.kernel_filter_list = [] | |||||
else: | |||||
self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file) | |||||
self.operation_count = 0 | |||||
self.operations_by_name = {} | |||||
self.top_level_prologue = ''' | |||||
#include "cutlass/library/library.h" | |||||
#include "cutlass/library/manifest.h" | |||||
namespace cutlass { | |||||
namespace library { | |||||
${prototypes} | |||||
void initialize_all(Manifest &manifest) { | |||||
''' | |||||
self.top_level_reserve = ' manifest.reserve(${operation_count});\n\n' | |||||
self.top_level_epilogue = ''' | |||||
} | |||||
} // namespace library | |||||
} // namespace cutlass | |||||
''' | |||||
def get_kernel_filters(self, kernelListFile):
if os.path.isfile(kernelListFile): | |||||
with open(kernelListFile, 'r') as fileReader: | |||||
lines = [line.rstrip() for line in fileReader if not line.startswith("#")] | |||||
lines = [re.compile(line) for line in lines if line] | |||||
return lines | |||||
else: | |||||
return [] | |||||
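# Returns True when any regex loaded from the kernel filter file matches the kernel name.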
def filter_out_kernels(self, kernel_name, kernel_filter_list): | |||||
for kernel_filter_re in kernel_filter_list: | |||||
if kernel_filter_re.search(kernel_name) is not None: | |||||
return True | |||||
return False | |||||
# | |||||
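# Example: the pattern "cutlass*fprop*nhwc" matches any name in which "cutlass", "fprop" and "nhwc" occur in that order.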
def _filter_string_matches(self, filter_string, haystack): | |||||
''' Returns true if all substrings appear in the haystack in order''' | |||||
substrings = filter_string.split('*') | |||||
for sub in substrings: | |||||
idx = haystack.find(sub) | |||||
if idx < 0: | |||||
return False | |||||
haystack = haystack[idx + len(sub):] | |||||
return True | |||||
# | |||||
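# An operation is kept only if its compute-capability window covers a requested architecture and it passes the include/exclude substring lists and, when present, the regex filter list.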
def filter(self, operation): | |||||
''' Filtering operations based on various criteria''' | |||||
# filter based on compute capability | |||||
enabled = False | |||||
for cc in self.compute_capabilities: | |||||
if cc >= operation.tile_description.minimum_compute_capability and \ | |||||
cc <= operation.tile_description.maximum_compute_capability: | |||||
enabled = True | |||||
break | |||||
if not enabled: | |||||
return False | |||||
if len(self.operations_enabled) and operation.operation_kind not in self.operations_enabled:
return False | |||||
# eliminate duplicates | |||||
if operation.procedural_name() in self.operations_by_name:
return False | |||||
# Filter based on list of valid substrings | |||||
if len(self.kernel_names): | |||||
name = operation.procedural_name() | |||||
enabled = False | |||||
# compare against the include list | |||||
for name_substr in self.kernel_names: | |||||
if self._filter_string_matches(name_substr, name): | |||||
enabled = True | |||||
break | |||||
# compare against the exclude list | |||||
for name_substr in self.ignore_kernel_names: | |||||
if self._filter_string_matches(name_substr, name): | |||||
enabled = False | |||||
break | |||||
if len(self.kernel_filter_list) > 0: | |||||
enabled = False | |||||
if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list): | |||||
enabled = True | |||||
# todo: filter based on compute data type | |||||
return enabled | |||||
# | |||||
# | |||||
def append(self, operation): | |||||
''' | |||||
Inserts the operation. | |||||
operation_kind -> configuration_name -> [] | |||||
''' | |||||
if self.filter(operation): | |||||
self.selected_kernels.append(operation.procedural_name()) | |||||
self.operations_by_name[operation.procedural_name()] = operation | |||||
# add the configuration | |||||
configuration_name = operation.configuration_name() | |||||
if operation.operation_kind not in self.operations:
self.operations[operation.operation_kind] = {}
if configuration_name not in self.operations[operation.operation_kind]:
self.operations[operation.operation_kind][configuration_name] = [] | |||||
self.operations[operation.operation_kind][configuration_name].append(operation) | |||||
self.operation_count += 1 | |||||
# | |||||
# | |||||
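# Emits all selected operations grouped by kind, then records every generated source file in manifest.cmake.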
def emit(self, target = GeneratorTarget.Library): | |||||
operation_emitters = { | |||||
GeneratorTarget.Library: EmitOperationKindLibrary | |||||
} | |||||
generated_path = os.path.join(self.args.curr_build_dir, 'generated') | |||||
# create generated/ | |||||
if os.path.exists(generated_path): | |||||
shutil.rmtree(generated_path) | |||||
os.mkdir(generated_path) | |||||
source_files = [] | |||||
top_level_path = os.path.join(generated_path, 'initialize_all.cpp') | |||||
with open(top_level_path, 'w') as top_level_file: | |||||
if target == GeneratorTarget.Library: | |||||
source_files.append(top_level_path) | |||||
prototypes = [] | |||||
for operation_kind, configurations in self.operations.items(): | |||||
prototypes.append(SubstituteTemplate( | |||||
"void initialize_all_${operation_kind}_operations(Manifest &manifest);", | |||||
{'operation_kind': OperationKindNames[operation_kind]})) | |||||
top_level_file.write(SubstituteTemplate(self.top_level_prologue, | |||||
{'prototypes': "\n".join(prototypes)})) | |||||
top_level_file.write(SubstituteTemplate( | |||||
self.top_level_reserve, {'operation_count': str(self.operation_count)})) | |||||
# for each operation kind, emit initializer for all configurations | |||||
for operation_kind, configurations in self.operations.items(): | |||||
with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter: | |||||
for configuration_name, operations in configurations.items(): | |||||
operation_kind_emitter.emit(configuration_name, operations) | |||||
source_files += operation_kind_emitter.source_files | |||||
top_level_file.write(SubstituteTemplate( | |||||
" initialize_all_${operation_kind}_operations(manifest);\n", | |||||
{'operation_kind': OperationKindNames[operation_kind]})) | |||||
top_level_file.write(self.top_level_epilogue) | |||||
# write the manifest.cmake file containing paths from all targets | |||||
manifest_path = os.path.join(generated_path, "manifest.cmake") | |||||
with open(manifest_path, "w") as manifest_file: | |||||
target_name = 'cutlass_library_objs' | |||||
target_text = SubstituteTemplate("""cutlass_target_sources( | |||||
${target_name} | |||||
BATCH_SOURCES ON | |||||
PRIVATE | |||||
""", { 'target_name': target_name}) | |||||
manifest_file.write(target_text) | |||||
for source_file in source_files: | |||||
manifest_file.write(" %s\n" % str(source_file.replace('\\', '/'))) | |||||
manifest_file.write(")") | |||||
# | |||||
################################################################################################### |
@@ -113,6 +113,31 @@ if(MGE_WITH_CUDA) | |||||
list(APPEND SOURCES ${SOURCES_})
file(GLOB_RECURSE CUSOURCES cuda/*.cu)
set(CUTLASS_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py) | |||||
set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated) | |||||
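# Runs generator.py once for the given operation and opcode class; generated .cu files and gen_cutlass.log land in a dedicated subdirectory.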
function(gen_cutlass_kimpl op type) | |||||
set(CURRENT_CUTLASS_GEN_DIR ${CUTLASS_GEN_DIR}/${op}_${type}) | |||||
file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR}) | |||||
execute_process( | |||||
COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations ${op} --type ${type} ${CURRENT_CUTLASS_GEN_DIR} | |||||
RESULT_VARIABLE gen_cutlass_result | |||||
OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log | |||||
ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log | |||||
) | |||||
if (NOT gen_cutlass_result EQUAL 0) | |||||
message(FATAL_ERROR "Error generating library instances. See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log") | |||||
endif() | |||||
endfunction() | |||||
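# Generate kernel implementations for every supported operation/opcode-class pair at configure time.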
gen_cutlass_kimpl(gemm simt) | |||||
gen_cutlass_kimpl(gemv simt) | |||||
gen_cutlass_kimpl(deconv simt) | |||||
gen_cutlass_kimpl(conv2d simt) | |||||
gen_cutlass_kimpl(conv2d tensorop8816) | |||||
gen_cutlass_kimpl(conv2d tensorop8832) | |||||
file(GLOB_RECURSE CUTLASS_SOURCES ${CUTLASS_GEN_DIR}/*.cu) | |||||
list(APPEND SOURCES ${CUTLASS_SOURCES}) | |||||
list(APPEND SOURCES ${CUSOURCES})
endif()
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<16>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 32, 64>, | |||||
cutlass::gemm::GemmShape<64, 32, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
16, | |||||
16, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<32>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 32, 64>, | |||||
cutlass::gemm::GemmShape<64, 32, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<8>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 32, 64>, | |||||
cutlass::gemm::GemmShape<64, 32, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
8, | |||||
8, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<16>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 64, 64>, | |||||
cutlass::gemm::GemmShape<64, 64, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
16, | |||||
16, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<32>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 64, 64>, | |||||
cutlass::gemm::GemmShape<64, 64, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<8>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 64, 64>, | |||||
cutlass::gemm::GemmShape<64, 64, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
8, | |||||
8, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<16>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 32, 64>, | |||||
cutlass::gemm::GemmShape<64, 32, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
16, | |||||
16, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<32>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 32, 64>, | |||||
cutlass::gemm::GemmShape<64, 32, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<8>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 32, 64>, | |||||
cutlass::gemm::GemmShape<64, 32, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
8, | |||||
8, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<16>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 64, 64>, | |||||
cutlass::gemm::GemmShape<64, 64, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
16, | |||||
16, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<32>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 64, 64>, | |||||
cutlass::gemm::GemmShape<64, 64, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<8>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 64, 64>, | |||||
cutlass::gemm::GemmShape<64, 64, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
8, | |||||
8, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<16>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 32, 64>, | |||||
cutlass::gemm::GemmShape<64, 32, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
16, | |||||
16, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<32>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 32, 64>, | |||||
cutlass::gemm::GemmShape<64, 32, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
32, | |||||
32, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<8>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 32, 64>, | |||||
cutlass::gemm::GemmShape<64, 32, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
8, | |||||
8, | |||||
false, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
            cutlass::int4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<256, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
            cutlass::int4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::int4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<256, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::int4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::int4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@ | |||||
#if !MEGDNN_TEGRA_X1 | |||||
// ignore warning of cutlass | |||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||||
using Convolution = | |||||
typename cutlass::conv::device::Convolution< | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNCxHWx<8>, | |||||
cutlass::int4b_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::layout::TensorNHWC, | |||||
int32_t, | |||||
cutlass::conv::ConvType::kConvolution, | |||||
cutlass::arch::OpClassTensorOp, | |||||
cutlass::arch::Sm75, | |||||
cutlass::gemm::GemmShape<128, 64, 64>, | |||||
cutlass::gemm::GemmShape<64, 64, 64>, | |||||
cutlass::gemm::GemmShape<8, 8, 32>, | |||||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||||
cutlass::int4b_t, | |||||
8, | |||||
int32_t, | |||||
int32_t, | |||||
float | |||||
>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||||
2, | |||||
8, | |||||
8, | |||||
true, | |||||
cutlass::arch::OpMultiplyAddSaturate, | |||||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||||
const typename Convolution::ElementSrc* d_src, | |||||
const typename Convolution::ElementFilter* d_filter, | |||||
const typename Convolution::ElementBias* d_bias, | |||||
const typename Convolution::ElementDst* d_z, | |||||
typename Convolution::ElementDst* d_dst, | |||||
int* workspace, | |||||
typename Convolution::ConvolutionParameter const& conv_param, | |||||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||||
cudaStream_t stream, | |||||
typename Convolution::ExtraParam extra_param); | |||||
#pragma GCC diagnostic pop | |||||
#endif |
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<256, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::int4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        false,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<256, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
            cutlass::uint4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 32, 64>,
        cutlass::gemm::GemmShape<64, 32, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<16>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        16,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<32>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        cutlass::int4b_t,
        cutlass::layout::TensorNCxHWx<8>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::layout::TensorNHWC,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 64, 64>,
        cutlass::gemm::GemmShape<64, 64, 64>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            8,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle,
        2,
        8,
        8,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_TN>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<256, 128, 128>,
        cutlass::gemm::GemmShape<64, 64, 128>,
        cutlass::gemm::GemmShape<8, 8, 32>,
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
            cutlass::uint4b_t,
            16,
            int32_t,
            int32_t,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 128, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 32, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 64, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<16, 128, 16>,
        cutlass::gemm::GemmShape<16, 128, 16>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        1,
        4,
        8,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<16, 64, 8>,
        cutlass::gemm::GemmShape<16, 64, 8>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        4,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 128, 32>,
        cutlass::gemm::GemmShape<32, 64, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 64, 32>,
        cutlass::gemm::GemmShape<32, 64, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<64, 128, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<64, 64, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 128, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 32, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 64, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<16, 128, 16>,
        cutlass::gemm::GemmShape<16, 128, 16>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        1,
        4,
        8,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<16, 64, 8>,
        cutlass::gemm::GemmShape<16, 64, 8>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        4,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 128, 32>,
        cutlass::gemm::GemmShape<32, 64, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif