GitOrigin-RevId: da3bcfb85a
release-1.5
@@ -1,5 +1,6 @@
# Mark generated files as binary, ignore them in git diff.
# dnn
dnn/scripts/cutlass_generator/list.bzl binary
dnn/src/cuda/conv_bias/int4/kimpl/* binary
dnn/src/cuda/conv_bias/int8/kimpl/* binary
dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary
@@ -0,0 +1,18 @@
load("list.bzl", "cutlass_gen_list")

genrule(
    name = "cutlass_kimpls",
    outs = cutlass_gen_list,
    cmd = """GEN=$(location //brain/megbrain/dnn/scripts/cutlass_generator:generator.py)
python3 $$GEN --operations gemm --type simt $(@D)
python3 $$GEN --operations gemv --type simt $(@D)
python3 $$GEN --operations deconv --type simt $(@D)
python3 $$GEN --operations conv2d --type simt $(@D)
python3 $$GEN --operations conv2d --type tensorop8816 $(@D)
python3 $$GEN --operations conv2d --type tensorop8832 $(@D)
""",
    tools = ["//brain/megbrain/dnn/scripts/cutlass_generator:generator.py"],
    visibility = ["//visibility:public"],
)
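# Editorial note (sketch): the generated kernel sources can also be rebuilt by
# building this genrule directly; the label below is inferred from the name
# above and the package path used in cmd/tools, and is an assumption:
#   bazel build //brain/megbrain/dnn/scripts/cutlass_generator:cutlass_kimpls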
@@ -0,0 +1,19 @@
# Generate device kernel registration code for CUTLASS kernels
## Usage
```bash
python3 generator.py [--operations {gemm,gemv,conv2d,deconv}] [--type {simt,tensorop8816,tensorop8832}] output
```
- `--operations`: operation kind; one of `gemm`, `gemv`, `conv2d`, `deconv`
- `--type`: opcode class; one of `simt`, `tensorop8816`, `tensorop8832`
- `output`: output directory for the generated CUTLASS kernels
## Generate file list for bazel
We generate `list.bzl` because bazel's `genrule` requires the output file list to be known during the analysis phase.
Please rerun `gen_list.py` whenever new operations are added:
```bash
python3 gen_list.py
```
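The generated `list.bzl` is a plain Starlark list of kernel file names. An abridged, illustrative excerpt (the entries shown here are hypothetical; the real list is produced by `gen_list.py`):
```python
# Generated by dnn/scripts/cutlass_generator/gen_list.py

cutlass_gen_list = [
 "cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
 "cutlass_simt_sgemm_128x128x8_32x64x8_2_nn.cu",
 # ... one ".cu" entry per generated kernel
]
```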
@@ -0,0 +1,614 @@
#
# \file conv2d_operation.py
#
# \brief Generates the CUTLASS Library's conv2d instances
#
import enum
import os.path
import shutil
from typing import Tuple, List

from lazy_file import LazyFile
from library import *

###################################################################################################

#
class Conv2dOperation:
    #
    def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
                 epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
                 need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNt):
        self.operation_kind = OperationKind.Conv2d
        self.conv_kind = conv_kind
        self.arch = arch
        self.tile_description = tile_description
        self.conv_type = conv_type
        self.src = src
        self.flt = flt
        self.bias = bias
        self.dst = dst
        self.element_epilogue = element_epilogue
        self.epilogue_functor = epilogue_functor
        self.swizzling_functor = swizzling_functor
        self.need_load_from_const = need_load_from_const
        self.implicit_gemm_mode = implicit_gemm_mode

    #
    def accumulator_type(self):
        accum = self.tile_description.math_instruction.element_accumulator
        return accum

    #
    def core_name(self):
        ''' The basic operation kind is prefixed with a letter indicating the accumulation type. '''
        intermediate_type = ''
        if self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp:
            inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape)
            if self.tile_description.math_instruction.element_a != self.flt.element and \
                    self.tile_description.math_instruction.element_a != self.accumulator_type():
                intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a]
        else:
            inst_shape = ''
        unity_kernel = ''
        if not self.need_load_from_const:
            unity_kernel = '_1x1'
        return "%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \
            inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \
            ShortEpilogueNames[self.epilogue_functor])

    #
    def extended_name(self):
        if self.dst.element != self.tile_description.math_instruction.element_accumulator:
            if self.src.element != self.flt.element:
                extended_name = "${element_dst}_${core_name}_${element_src}_${element_flt}"
            elif self.src.element == self.flt.element:
                extended_name = "${element_dst}_${core_name}_${element_src}"
        else:
            if self.src.element != self.flt.element:
                extended_name = "${core_name}_${element_src}_${element_flt}"
            elif self.src.element == self.flt.element:
                extended_name = "${core_name}_${element_src}"
        extended_name = SubstituteTemplate(extended_name, {
            'element_src': DataTypeNames[self.src.element],
            'element_flt': DataTypeNames[self.flt.element],
            'element_dst': DataTypeNames[self.dst.element],
            'core_name': self.core_name()
        })
        return extended_name

    #
    def layout_name(self):
        if self.src.layout == self.dst.layout:
            layout_name = "${src_layout}_${flt_layout}"
        else:
            layout_name = "${src_layout}_${flt_layout}_${dst_layout}"
        layout_name = SubstituteTemplate(layout_name, {
            'src_layout': ShortLayoutTypeNames[self.src.layout],
            'flt_layout': ShortLayoutTypeNames[self.flt.layout],
            'dst_layout': ShortLayoutTypeNames[self.dst.layout],
        })
        return layout_name

    #
    def configuration_name(self):
        ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
        opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class]
        warp_shape = [int(self.tile_description.threadblock_shape[idx] / self.tile_description.warp_count[idx]) for idx in range(3)]
        threadblock = "%dx%dx%d_%dx%dx%d_%d" % (
            self.tile_description.threadblock_shape[0],
            self.tile_description.threadblock_shape[1],
            self.tile_description.threadblock_shape[2],
            warp_shape[0],
            warp_shape[1],
            warp_shape[2],
            self.tile_description.stages,
        )
        configuration_name = "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}"
        return SubstituteTemplate(
            configuration_name,
            {
                'opcode_class': opcode_class_name,
                'extended_name': self.extended_name(),
                'threadblock': threadblock,
                'layout': self.layout_name(),
            }
        )

    #
    def procedural_name(self):
        ''' The full procedural name indicates architecture, extended name, tile size, and layout. '''
        return self.configuration_name()
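    # Example (editorial, illustrative only): for an s8 SIMT fprop kernel with a
    # 128x128x32 threadblock, warp count [2, 4, 1] (warp shape 64x32x32) and
    # NC4HW4/C4RSK4 layouts, procedural_name() yields a string of the form
    #   "cutlass_simt_s8_ifprop_<epilogue tag>_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4"
    # where <epilogue tag> comes from ShortEpilogueNames in library.py.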
###################################################################################################
#
# Emits single instances of a CUTLASS device-wide operator
#
###################################################################################################

class EmitConv2dInstance:
    def __init__(self):
        self.template = """
// kernel instance "${operation_name}" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        ${element_src},
        ${layout_src},
        ${element_flt},
        ${layout_flt},
        ${element_dst},
        ${layout_dst},
        ${element_bias},
        ${layout_bias},
        ${element_accumulator},
        ${conv_type},
        ${opcode_class},
        ${arch},
        cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
        cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
        cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
        ${epilogue_functor}<
            ${element_dst},
            ${epilogue_vector_length},
            ${element_accumulator},
            ${element_bias},
            ${element_epilogue}
        >,
        ${swizzling_functor},
        ${stages},
        ${alignment_src},
        ${alignment_filter},
        ${nonuninity_kernel},
        ${math_operator},
        ${implicit_gemm_mode}>;
"""

    def emit(self, operation):
        warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)]
        epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) / DataTypeSize[operation.dst.element])
        values = {
            'operation_name': operation.procedural_name(),
            'conv_type': ConvTypeTag[operation.conv_type],
            'element_src': DataTypeTag[operation.src.element],
            'layout_src': LayoutTag[operation.src.layout],
            'element_flt': DataTypeTag[operation.flt.element],
            'layout_flt': LayoutTag[operation.flt.layout],
            'element_dst': DataTypeTag[operation.dst.element],
            'layout_dst': LayoutTag[operation.dst.layout],
            'element_bias': DataTypeTag[operation.bias.element],
            'layout_bias': LayoutTag[operation.bias.layout],
            'element_accumulator': DataTypeTag[operation.accumulator_type()],
            'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
            'arch': "cutlass::arch::Sm%d" % operation.arch,
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            'warp_shape_m': str(warp_shape[0]),
            'warp_shape_n': str(warp_shape[1]),
            'warp_shape_k': str(warp_shape[2]),
            'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
            'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
            'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
            'epilogue_vector_length': str(epilogue_vector_length),
            'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
            'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
            'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
            'stages': str(operation.tile_description.stages),
            'alignment_src': str(operation.src.alignment),
            'alignment_filter': str(operation.flt.alignment),
            'nonuninity_kernel': str(operation.need_load_from_const).lower(),
            'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
            'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode]
        }
        return SubstituteTemplate(self.template, values)
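# Illustrative usage (editorial sketch; all enums and helper classes come from
# library.py and are the same ones referenced elsewhere in this file):
#   math_inst = MathInstruction([1, 1, 4], DataType.s8, DataType.s8, DataType.s32,
#                               OpcodeClass.Simt, MathOperation.multiply_add)
#   tile = TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, 61, 1024)
#   src = TensorDescription(DataType.s8, LayoutType.TensorNC4HW4, 4)
#   flt = TensorDescription(DataType.s8, LayoutType.TensorC4RSK4, 4)
#   bias = TensorDescription(DataType.s32, LayoutType.TensorNC4HW4, 1)
#   dst = TensorDescription(DataType.s8, LayoutType.TensorNC4HW4, 4)
#   op = Conv2dOperation(ConvKind.Fprop, ConvType.Convolution, 61, tile, src,
#                        flt, bias, dst, DataType.f32,
#                        EpilogueFunctor.BiasAddLinearCombinationClamp)
#   print(EmitConv2dInstance().emit(op))  # C++ "using Convolution = ..." text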
class EmitDeconvInstance:
    def __init__(self):
        self.template = """
// kernel instance "${operation_name}" generated by cutlass generator
using Deconvolution =
    typename cutlass::conv::device::Deconvolution<
        ${element_src},
        ${layout_src},
        ${element_flt},
        ${layout_flt},
        ${element_dst},
        ${layout_dst},
        ${element_bias},
        ${layout_bias},
        ${element_accumulator},
        ${opcode_class},
        ${arch},
        cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>,
        cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>,
        cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>,
        ${epilogue_functor}<
            ${element_dst},
            ${epilogue_vector_length},
            ${element_accumulator},
            ${element_bias},
            ${element_epilogue}
        >,
        ${swizzling_functor},
        ${stages},
        ${alignment_src},
        ${alignment_filter},
        ${nonuninity_kernel},
        ${math_operator},
        ${implicit_gemm_mode}>;
"""

    def emit(self, operation):
        warp_shape = [int(operation.tile_description.threadblock_shape[idx] / operation.tile_description.warp_count[idx]) for idx in range(3)]
        epilogue_vector_length = int(min(operation.dst.alignment * DataTypeSize[operation.dst.element], 128) / DataTypeSize[operation.dst.element])
        values = {
            'operation_name': operation.procedural_name(),
            'element_src': DataTypeTag[operation.src.element],
            'layout_src': LayoutTag[operation.src.layout],
            'element_flt': DataTypeTag[operation.flt.element],
            'layout_flt': LayoutTag[operation.flt.layout],
            'element_dst': DataTypeTag[operation.dst.element],
            'layout_dst': LayoutTag[operation.dst.layout],
            'element_bias': DataTypeTag[operation.bias.element],
            'layout_bias': LayoutTag[operation.bias.layout],
            'element_accumulator': DataTypeTag[operation.accumulator_type()],
            'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class],
            'arch': "cutlass::arch::Sm%d" % operation.arch,
            'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]),
            'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]),
            'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]),
            'warp_shape_m': str(warp_shape[0]),
            'warp_shape_n': str(warp_shape[1]),
            'warp_shape_k': str(warp_shape[2]),
            'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]),
            'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]),
            'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]),
            'epilogue_vector_length': str(epilogue_vector_length),
            'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor],
            'element_epilogue': str(DataTypeTag[operation.element_epilogue]),
            'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor],
            'stages': str(operation.tile_description.stages),
            'alignment_src': str(operation.src.alignment),
            'alignment_filter': str(operation.flt.alignment),
            'nonuninity_kernel': str(operation.need_load_from_const).lower(),
            'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
            'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode]
        }
        return SubstituteTemplate(self.template, values)
###################################################################################################
#
# Generator functions for all layouts
#
###################################################################################################

#
def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
                   skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNt):
    operations = []
    element_epilogue = DataType.f32
    if conv_kind == ConvKind.Fprop:
        if src_layout == LayoutType.TensorNHWC:
            swizzling_functor = SwizzlingFunctor.ConvFpropNHWC
        else:
            swizzling_functor = SwizzlingFunctor.ConvFpropNCxHWx
    else:
        swizzling_functor = SwizzlingFunctor.ConvDgradNCxHWx

    # skip rule
    def filter_tile_with_layout(tile: TileDescription, layout: LayoutType) -> bool:
        return layout == LayoutType.TensorNC32HW32 and \
               tile.threadblock_shape[0] % 32 != 0

    # rule for bias_type and epilogues
    def get_bias_type_and_epilogues(tile: TileDescription, \
                                    out_dtype: DataType) -> Tuple[DataType, List[EpilogueFunctor]]:
        if tile.math_instruction.element_accumulator == DataType.s32 and \
                out_dtype != DataType.f32:
            bias_type = DataType.s32
            if tile.math_instruction.element_b == DataType.u4:
                epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp, EpilogueFunctor.BiasAddLinearCombinationReluClamp]
            else:
                epilogues = [EpilogueFunctor.BiasAddLinearCombinationClamp, EpilogueFunctor.BiasAddLinearCombinationReluClamp, \
                             EpilogueFunctor.BiasAddLinearCombinationHSwishClamp]
        elif tile.math_instruction.element_accumulator == DataType.f32 or \
                out_dtype == DataType.f32:
            bias_type = DataType.f32
            epilogues = [EpilogueFunctor.BiasAddLinearCombination, EpilogueFunctor.BiasAddLinearCombinationRelu, \
                         EpilogueFunctor.BiasAddLinearCombinationHSwish]
        return bias_type, epilogues

    # rule for filter alignment
    def get_flt_align(tile: TileDescription) -> int:
        nonlocal flt_align
        if tile.math_instruction.opcode_class == OpcodeClass.Simt \
                and tile.math_instruction.element_accumulator == DataType.s32:
            thread_num = tile.warp_count[0] * tile.warp_count[1] * tile.warp_count[2] * 32
            flt_block = tile.threadblock_shape[0] * tile.threadblock_shape[2] \
                        * DataTypeSize[tile.math_instruction.element_a]
            load_per_thread = flt_block // thread_num
            if load_per_thread >= 128:
                flt_align = 128
            elif load_per_thread >= 64:
                flt_align = 64
            else:
                assert load_per_thread >= 32
                flt_align = 32
        return flt_align
    def get_dst_align(tile: TileDescription, out_layout: LayoutType) -> int:
        nonlocal dst_align
        if tile.math_instruction.opcode_class == OpcodeClass.TensorOp \
                and out_layout == LayoutType.TensorNC4HW4:
            dst_align = 32
        return dst_align
    def filter_epilogue_with_conv_kind(epilogue: EpilogueFunctor, conv_kind: ConvKind) -> bool:
        return conv_kind == ConvKind.Dgrad \
               and epilogue != EpilogueFunctor.BiasAddLinearCombinationClamp

    # loop over all tile descriptions
    for tile in tile_descriptions:
        if filter_tile_with_layout(tile, dst_layout):
            continue
        bias_type, epilogues = get_bias_type_and_epilogues(tile, dst_type)
        flt_align = get_flt_align(tile)
        dst_align = get_dst_align(tile, dst_layout)
        for epilogue in epilogues:
            if filter_epilogue_with_conv_kind(epilogue, conv_kind):
                continue
            if dst_type == DataType.f32:
                bias_type = DataType.f32
            #
            src = TensorDescription(tile.math_instruction.element_b, src_layout, int(src_align / DataTypeSize[tile.math_instruction.element_b]))
            flt = TensorDescription(tile.math_instruction.element_a, flt_layout, int(flt_align / DataTypeSize[tile.math_instruction.element_a]))
            bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
            dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))
            new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode)
            operations.append(new_operation)
            if not skip_unity_kernel:
                new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode)
                operations.append(new_operation)
    return operations
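# Alignment bookkeeping (editorial note): src_align/flt_align/dst_align are
# given in bits and converted to element counts above, e.g. with s8 tensors
# (DataTypeSize == 8): src_align = 32 -> 4-element vectors, dst_align = 128 ->
# 16-element stores; bias alignment is max(1, 32 // DataTypeSize[bias_type]).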
###################################################################################################
#
# Emitter functions for all targets
#
###################################################################################################
class EmitConv2dConfigurationLibrary:
    def __init__(self, operation_path, configuration_name):
        self.configuration_name = configuration_name
        self.configuration_path = os.path.join(operation_path, "%s.cu" % configuration_name)
        self.instance_emitter = EmitConv2dInstance()

        self.instance_template = """
${operation_instance}
// Derived class
struct ${operation_name} :
    public ${operation_name}_base { };
///////////////////////////////////////////////////////////////////////////////////////////////////
"""

        self.header_template = """
/*
    Generated by conv2d_operation.py - Do not edit.
*/
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
#include "library_internal.h"
#include "conv2d_operation.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
"""

        self.configuration_header = """
namespace cutlass {
namespace library {
// Initialize all instances
void initialize_${configuration_name}(Manifest &manifest) {
"""

        self.configuration_instance = """
    using Operation_${operation_name} = cutlass::conv::device::ImplicitGemmConvolution<
        ${operation_name}>;
    manifest.append(new cutlass::library::Conv2dOperation<
        Operation_${operation_name}>(
            "${operation_name}"));
"""

        self.configuration_epilogue = """
}
"""

        self.epilogue_template = """
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library
} // namespace cutlass
///////////////////////////////////////////////////////////////////////////////////////////////////
"""

    #
    def __enter__(self):
        self.configuration_file = open(self.configuration_path, "w")
        self.configuration_file.write(SubstituteTemplate(self.header_template, {
            'configuration_name': self.configuration_name
        }))
        self.operations = []
        return self

    #
    def emit(self, operation):
        self.operations.append(operation)
        self.configuration_file.write(SubstituteTemplate(self.instance_template, {
            'configuration_name': self.configuration_name,
            'operation_name': operation.procedural_name(),
            'operation_instance': self.instance_emitter.emit(operation)
        }))

    #
    def __exit__(self, exception_type, exception_value, traceback):
        self.configuration_file.write(SubstituteTemplate(self.configuration_header, {
            'configuration_name': self.configuration_name
        }))
        for operation in self.operations:
            self.configuration_file.write(SubstituteTemplate(self.configuration_instance, {
                'configuration_name': self.configuration_name,
                'operation_name': operation.procedural_name()
            }))
        self.configuration_file.write(self.configuration_epilogue)
        self.configuration_file.write(self.epilogue_template)
        self.configuration_file.close()
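# Illustrative usage (editorial sketch): `ops` is a list of Conv2dOperation
# instances sharing one configuration_name(); the context manager writes
# "<operation_path>/<configuration_name>.cu" registering every instance:
#   with EmitConv2dConfigurationLibrary("/tmp/kimpl", ops[0].configuration_name()) as emitter:
#       for op in ops:
#           emitter.emit(op)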
###################################################################################################
###################################################################################################
# Emitters for Conv Kernel Wrapper
#
###################################################################################################

class EmitConvSingleKernelWrapper():
    def __init__(self, kernel_path, operation, wrapper_path):
        self.kernel_path = kernel_path
        self.wrapper_path = wrapper_path
        self.operation = operation

        self.conv_wrappers = { \
            ConvKind.Fprop: """
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
""", \
            ConvKind.Dgrad: """
template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper<Deconvolution>(
    const typename Deconvolution::ElementSrc* d_src,
    const typename Deconvolution::ElementFilter* d_filter,
    const typename Deconvolution::ElementBias* d_bias,
    const typename Deconvolution::ElementDst* d_z,
    typename Deconvolution::ElementDst* d_dst,
    int* workspace,
    typename Deconvolution::ConvolutionParameter const& conv_param,
    typename Deconvolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream);
""", \
        }

        if self.operation.conv_kind == ConvKind.Fprop:
            self.instance_emitter = EmitConv2dInstance()
        else:
            assert self.operation.conv_kind == ConvKind.Dgrad
            self.instance_emitter = EmitDeconvInstance()

        self.header_template = """
#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "${wrapper_path}"
"""
        self.instance_template = """
${operation_instance}
"""
        self.wrapper_template = """
${wrapper_instance}
"""
        self.epilogue_template = """
#pragma GCC diagnostic pop
#endif
"""

    #
    def __enter__(self):
        self.kernel_path = os.path.join(self.kernel_path, "%s.cu" % self.operation.procedural_name())
        self.kernel_file = LazyFile(self.kernel_path)
        self.kernel_file.write(SubstituteTemplate(self.header_template, {
            'wrapper_path': self.wrapper_path,
        }))
        return self

    #
    def emit(self):
        self.kernel_file.write(SubstituteTemplate(self.instance_template, {
            'operation_instance': self.instance_emitter.emit(self.operation),
        }))
        # emit wrapper
        wrapper = SubstituteTemplate(self.wrapper_template, {
            'wrapper_instance': self.conv_wrappers[self.operation.conv_kind],
        })
        self.kernel_file.write(wrapper)

    #
    def __exit__(self, exception_type, exception_value, traceback):
        self.kernel_file.write(self.epilogue_template)
        self.kernel_file.close()

###################################################################################################
###################################################################################################
@@ -0,0 +1,38 @@
from generator import (
    GenerateGemmOperations,
    GenerateGemvOperations,
    GenerateConv2dOperations,
    GenerateDeconvOperations,
)


class GenArg:
    def __init__(self, gen_op, gen_type):
        self.operations = gen_op
        self.type = gen_type


def write_op_list(f, gen_op, gen_type):
    if gen_op == "gemm":
        operations = GenerateGemmOperations(GenArg(gen_op, gen_type))
    elif gen_op == "gemv":
        operations = GenerateGemvOperations(GenArg(gen_op, gen_type))
    elif gen_op == "conv2d":
        operations = GenerateConv2dOperations(GenArg(gen_op, gen_type))
    elif gen_op == "deconv":
        operations = GenerateDeconvOperations(GenArg(gen_op, gen_type))
    for op in operations:
        f.write(' "%s.cu",\n' % op.procedural_name())


if __name__ == "__main__":
    with open("list.bzl", "w") as f:
        f.write("# Generated by dnn/scripts/cutlass_generator/gen_list.py\n\n")
        f.write("cutlass_gen_list = [\n")
        write_op_list(f, "gemm", "simt")
        write_op_list(f, "gemv", "simt")
        write_op_list(f, "deconv", "simt")
        write_op_list(f, "conv2d", "simt")
        write_op_list(f, "conv2d", "tensorop8816")
        write_op_list(f, "conv2d", "tensorop8832")
        f.write("]")
@@ -0,0 +1,651 @@
#
# \file generator.py
#
# \brief Generates the CUTLASS Library's instances
#
import enum
import os.path
import shutil
import argparse

from library import *
from manifest import *

###################################################################################################

#
def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0):
    # by default, use the latest CUDA Toolkit version
    cuda_version = [11, 0, 132]

    # Update cuda_version based on parsed string
    if semantic_ver_string != '':
        for i, x in enumerate([int(x) for x in semantic_ver_string.split('.')]):
            if i < len(cuda_version):
                cuda_version[i] = x
            else:
                cuda_version.append(x)
    return cuda_version >= [major, minor, patch]
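# Examples (editorial; they follow directly from the list comparison above):
#   CudaToolkitVersionSatisfies('10.2', 10, 1)  -> True   (parsed as [10, 2, 132])
#   CudaToolkitVersionSatisfies('', 11, 1)      -> False  (default [11, 0, 132])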
###################################################################################################
###################################################################################################

#
def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \
                       alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \
                       swizzling_functor = SwizzlingFunctor.Identity8):
    if complex_transforms is None:
        complex_transforms = [(ComplexTransform.none, ComplexTransform.none),]

    element_a, element_b, element_c, element_epilogue = data_type

    operations = []

    # by default, only generate the largest tile and largest alignment
    if manifest.args.kernels == '':
        tile_descriptions = [tile_descriptions[0],]
        alignment_constraints = [alignment_constraints[0],]

    for layout in layouts:
        for tile_description in tile_descriptions:
            for alignment in alignment_constraints:
                for complex_transform in complex_transforms:
                    alignment_c = min(8, alignment)

                    A = TensorDescription(element_a, layout[0], alignment, complex_transform[0])
                    B = TensorDescription(element_b, layout[1], alignment, complex_transform[1])
                    C = TensorDescription(element_c, layout[2], alignment_c)

                    new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \
                                                  tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor)

                    manifest.append(new_operation)
                    operations.append(new_operation)
    return operations
###########################################################################################################
# ConvolutionOperator support variations
# ____________________________________________________________________
# ConvolutionOperator  | Analytic          | Optimized
# ____________________________________________________________________
# | Fprop              | (strided)         | (strided)
# | Dgrad              | (strided, unity*) | (unity)
# | Wgrad              | (strided)         | (strided)
# ____________________________________________________________________
#
# Note : Operators marked (*) are supported but not generated to keep the instantiated kernel count low
###########################################################################################################
# Convolution for 2D operations
def CreateConv2dOperator(manifest, layout, tile_descriptions, data_type, alignment, \
                         conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad], epilogue_functor = EpilogueFunctor.LinearCombination):
    element_a, element_b, element_c, element_epilogue = data_type

    # one exceptional case
    alignment_c = min(8, alignment)

    # iterator algorithm (analytic and optimized)
    iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]

    # by default, only generate the largest tile size
    if manifest.args.kernels == '':
        tile_descriptions = [tile_descriptions[0],]

    operations = []

    for tile in tile_descriptions:
        for conv_kind in conv_kinds:
            for iterator_algorithm in iterator_algorithms:
                A = TensorDescription(element_a, layout[0], alignment)
                B = TensorDescription(element_b, layout[1], alignment)
                C = TensorDescription(element_c, layout[2], alignment_c)

                # unity stride only for Optimized Dgrad
                if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad):
                    new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile, \
                                                    A, B, C, element_epilogue, StrideSupport.Unity, epilogue_functor)
                    manifest.append(new_operation)
                    operations.append(new_operation)

                # strided dgrad is not supported by Optimized Dgrad
                if (iterator_algorithm == IteratorAlgorithm.Optimized) and (conv_kind == ConvKind.Dgrad):
                    continue

                # strided support for Fprop (Analytic/Optimized), Dgrad (Analytic), and Wgrad (Analytic)
                new_operation = Conv2dOperation(conv_kind, iterator_algorithm, tile.minimum_compute_capability, tile, \
                                                A, B, C, element_epilogue, StrideSupport.Strided, epilogue_functor)
                manifest.append(new_operation)
                operations.append(new_operation)
    return operations
###################################################################################################
###################################################################################################

def GenerateConv2d_Simt(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC4HW4, LayoutType.TensorC4RSK4),
    ]

    math_instructions = [
        MathInstruction( \
            [1, 1, 4], \
            DataType.s8, DataType.s8, DataType.s32, \
            OpcodeClass.Simt, \
            MathOperation.multiply_add),
    ]

    dst_layouts = [
        LayoutType.TensorNC4HW4,
        LayoutType.TensorNC32HW32,
        LayoutType.TensorNHWC,
        LayoutType.TensorNHWC,
        LayoutType.TensorNCHW
    ]

    dst_types = [
        DataType.s8,
        DataType.s8,
        DataType.u4,
        DataType.s4,
        DataType.f32,
    ]

    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in zip(dst_types, dst_layouts):
                if dst_type == DataType.s4 or dst_type == DataType.u4:
                    min_cc = 75
                    skip_unity_kernel = True
                else:
                    min_cc = 61
                    skip_unity_kernel = False

                tile_descriptions = [
                    TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                    TileDescription([128,  64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 64,  64, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([128,  32, 32], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 32,  64, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 64,  32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 32,  32, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([ 16,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                ]

                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 32, 32, 32,
                                             skip_unity_kernel)
    return operations
def GenerateConv2d_TensorOp_8816(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32),
    ]

    math_instructions = [
        MathInstruction( \
            [8, 8, 16], \
            DataType.s8, DataType.s8, DataType.s32, \
            OpcodeClass.TensorOp, \
            MathOperation.multiply_add_saturate),
    ]

    dst_layouts = [
        LayoutType.TensorNC32HW32,
        LayoutType.TensorNC4HW4,
    ]

    dst_types = [
        DataType.s8,
        DataType.s8,
    ]

    min_cc = 75
    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in zip(dst_types, dst_layouts):
                if dst_layout == LayoutType.TensorNC32HW32:
                    tile_descriptions = [
                        TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 32,  64, 64], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                    ]
                else:
                    assert dst_layout == LayoutType.TensorNC4HW4
                    tile_descriptions = [
                        TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                        TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([128,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 64,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                        TileDescription([ 32,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                    ]
                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 128, 128, 64,
                                             False)
    return operations
def GenerateConv2d_TensorOp_8832(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64),
    ]

    math_instructions = [
        MathInstruction( \
            [8, 8, 32], \
            DataType.s4, DataType.s4, DataType.s32, \
            OpcodeClass.TensorOp, \
            MathOperation.multiply_add_saturate), \
        MathInstruction( \
            [8, 8, 32], \
            DataType.s4, DataType.u4, DataType.s32, \
            OpcodeClass.TensorOp, \
            MathOperation.multiply_add_saturate)
    ]

    dst_layouts = [
        LayoutType.TensorNC64HW64,
    ]

    min_cc = 75
    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_layout in dst_layouts:
                dst_type = math_inst.element_b
                tile_descriptions = [
                    TileDescription([256, 128, 128], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([128, 128, 128], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 128, 128, 64,
                                             True)

    layouts_nhwc = [
        (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),
        (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 64),
        (LayoutType.TensorNHWC, LayoutType.TensorNC32HW32, 128),
    ]

    dst_layouts_nhwc = [
        LayoutType.TensorNHWC,
    ]

    for math_inst in math_instructions:
        for layout in layouts_nhwc:
            for dst_layout in dst_layouts_nhwc:
                dst_type = math_inst.element_b
                tile_descriptions = [
                    TileDescription([128, 32, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([128, 64, 64], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, layout[2], layout[2], 32,
                                             False, ImplicitGemmMode.GemmTn)
    return operations
def GenerateDeconv_Simt(args):
    operations = []

    layouts = [
        (LayoutType.TensorNC4HW4, LayoutType.TensorK4RSC4),
    ]

    math_instructions = [
        MathInstruction( \
            [1, 1, 4], \
            DataType.s8, DataType.s8, DataType.s32, \
            OpcodeClass.Simt, \
            MathOperation.multiply_add),
    ]

    dst_layouts = [
        LayoutType.TensorNC4HW4,
    ]

    dst_types = [
        DataType.s8,
    ]

    min_cc = 61
    max_cc = 1024

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in zip(dst_types, dst_layouts):
                tile_descriptions = [
                    TileDescription([64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                    TileDescription([32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                    TileDescription([16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([16,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                ]
                operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1],
                                             dst_layout, dst_type, min_cc, 32, 32, 32,
                                             True)
    return operations
################################################################################
# parameters
# Edge - for tiles, the edges represent the length of one side
# Ratio - the maximum ratio between 2 edges, limits the skinniness of tiles
# MaxEdge - maximum length of each edge
# Min/Max - minimum/maximum of the product of edge lengths
################################################################################

warpsPerThreadblockEdge = [1, 2, 4, 8, 16]
warpsPerThreadblockRatio = 2
warpsPerThreadblockMax = 16
# NOTE 1x32 and 2x16 warp tile shapes fail validation for ~10% of cases
warpShapeEdges = [8, 16, 32, 64, 128, 256]
warpShapeRatio = 4
warpShapeMax = 64 * 64
warpShapeMin = 8 * 8
threadblockEdgeMax = 256

#   char, type                      bits/elem, max tile,  L0 threadblock tiles
precisions = {
    "c": ["cutlass::complex<float>",  64, 64 * 128,  [[ 64, 128], [ 64, 32]]],
    "d": ["double",                   64, 64 * 64,   [[ 64,  64], [ 32, 32]]],
    "h": ["cutlass::half_t",          16, 128 * 256, [[256, 128], [ 64, 128], [ 64, 32]]],
    "i": ["int",                      32, 128 * 128, [[128,  64], [ 16, 32]]],
    "s": ["float",                    32, 128 * 128, [[128, 256], [128, 128], [ 64, 64]]],
    "z": ["cutlass::complex<double>", 128, 64 * 64,  [[ 32,  64], [ 16, 32]]],
}
# L1 will have a single kernel for every unique shape
# L2 will have everything else
def GenerateGemm_Simt(args):
    ################################################################################
    # warps per threadblock
    ################################################################################
    warpsPerThreadblocks = []
    for warpsPerThreadblock0 in warpsPerThreadblockEdge:
        for warpsPerThreadblock1 in warpsPerThreadblockEdge:
            if warpsPerThreadblock0 / warpsPerThreadblock1 <= warpsPerThreadblockRatio \
                    and warpsPerThreadblock1 / warpsPerThreadblock0 <= warpsPerThreadblockRatio \
                    and warpsPerThreadblock0 * warpsPerThreadblock1 <= warpsPerThreadblockMax:
                warpsPerThreadblocks.append([warpsPerThreadblock0,
                                             warpsPerThreadblock1])

    ################################################################################
    # warp shapes
    ################################################################################
    warpNumThreads = 32
    warpShapes = []
    for warp0 in warpShapeEdges:
        for warp1 in warpShapeEdges:
            if warp0 / warp1 <= warpShapeRatio \
                    and warp1 / warp0 <= warpShapeRatio \
                    and warp0 * warp1 <= warpShapeMax \
                    and warp0 * warp1 > warpShapeMin:
                warpShapes.append([warp0, warp1])

    # sgemm
    precisionType, precisionBits, threadblockMaxElements, threadblockTilesL0 = precisions["s"]

    layouts = [
        (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),  # nn
        (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor),     # nt
        (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),     # tn
        (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),        # tt
    ]

    math_instructions = [
        MathInstruction( \
            [1, 1, 1], \
            DataType.f32, DataType.f32, DataType.f32, \
            OpcodeClass.Simt, \
            MathOperation.multiply_add),
    ]

    min_cc = 50
    max_cc = 1024

    operations = []
    for math_inst in math_instructions:
        for layout in layouts:
            data_type = [
                math_inst.element_a,
                math_inst.element_b,
                math_inst.element_accumulator,
                math_inst.element_accumulator,
            ]
            tile_descriptions = [
                TileDescription([ 64, 256, 8], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                TileDescription([256,  64, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32, 256, 8], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                TileDescription([256,  32, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([128, 128, 8], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([128,  64, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([ 64, 128, 8], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([128,  32, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                TileDescription([ 64,  64, 8], 2, [2, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32,  64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 64,  32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 32,  32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([  8,  32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 16,  32, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 16,  64, 8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                TileDescription([ 16, 128, 8], 2, [1, 2, 1], math_inst, min_cc, max_cc),
            ]
            for warpsPerThreadblock in warpsPerThreadblocks:
                for warpShape in warpShapes:
                    warpThreadsM = 0
                    if warpShape[0] > warpShape[1]:
                        warpThreadsM = 8
                    else:
                        warpThreadsM = 4
                    warpThreadsN = warpNumThreads // warpThreadsM
                    # skip shapes with conflicting rectangularity
                    # they are unlikely to be fastest
                    blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1]
                    blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1]
                    warpG = warpShape[0] > warpShape[1]
                    warpL = warpShape[0] < warpShape[1]

                    blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1] * 2
                    blockL2 = warpsPerThreadblock[0] * 2 < warpsPerThreadblock[1]
                    warpG2 = warpShape[0] > warpShape[1] * 2
                    warpL2 = warpShape[0] * 2 < warpShape[1]

                    if blockG2 and warpL: continue
                    if blockL2 and warpG: continue
                    if warpG2 and blockL: continue
                    if warpL2 and blockG: continue

                    # check threadblock ratios and max
                    threadblockTile = [warpShape[0] * warpsPerThreadblock[0],
                                       warpShape[1] * warpsPerThreadblock[1]]
                    if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements: continue
                    if threadblockTile[0] > threadblockEdgeMax: continue
                    if threadblockTile[1] > threadblockEdgeMax: continue
                    totalThreads = warpNumThreads * warpsPerThreadblock[0] * warpsPerThreadblock[1]

                    # calculate unroll
                    # ensure that every iteration at least a full load of A,B are done
                    unrollMin = 8
                    unrollMin0 = totalThreads // threadblockTile[0]
                    unrollMin1 = totalThreads // threadblockTile[1]
                    unroll = max(unrollMin, unrollMin0, unrollMin1)
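                    # e.g. (editorial) 256 threads with threadblockTile = [128, 64]:
                    # unrollMin0 = 2, unrollMin1 = 4, so unroll = max(8, 2, 4) = 8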
                    threadTileM = warpShape[0] // warpThreadsM
                    threadTileN = warpShape[1] // warpThreadsN
                    if threadTileM < 2 or threadTileN < 2: continue
                    if threadTileM * threadTileN * precisionBits > 8 * 8 * 32: continue

                    # epilogue currently only supports N < WarpNumThreads
                    if threadblockTile[1] < warpNumThreads: continue

                    # limit smem
                    smemBitsA = threadblockTile[0] * unroll * 2 * precisionBits
                    smemBitsB = threadblockTile[1] * unroll * 2 * precisionBits
                    smemKBytes = (smemBitsA + smemBitsB) / 8 / 1024
                    if (smemKBytes > 48): continue

                    tile = TileDescription([threadblockTile[0], threadblockTile[1], unroll], \
                                           2, \
                                           [threadblockTile[0] // warpShape[0], threadblockTile[1] // warpShape[1], 1], \
                                           math_inst, min_cc, max_cc)

                    def filter(t: TileDescription) -> bool:
                        nonlocal tile
                        return t.threadblock_shape[0] == tile.threadblock_shape[0] and \
                               t.threadblock_shape[1] == tile.threadblock_shape[1] and \
                               t.threadblock_shape[2] == tile.threadblock_shape[2] and \
                               t.warp_count[0] == tile.warp_count[0] and \
                               t.warp_count[1] == tile.warp_count[1] and \
                               t.warp_count[2] == tile.warp_count[2] and \
                               t.stages == tile.stages

                    if not any(t for t in tile_descriptions if filter(t)): continue

                    operations += GeneratesGemm(tile, data_type, layout[0], layout[1], layout[2], min_cc)
    return operations
#
def GenerateGemv_Simt(args):
    threadBlockShape_N = [128, 64, 32]
    ldgBits_A = [128, 64, 32]
    ldgBits_B = [128, 64, 32]

    layouts = [
        (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),
    ]

    math_instructions = [
        MathInstruction( \
            [1, 1, 1], \
            DataType.f32, DataType.f32, DataType.f32, \
            OpcodeClass.Simt, \
            MathOperation.multiply_add),
    ]

    min_cc = 50

    operations = []
    for math_inst in math_instructions:
        for layout in layouts:
            data_type = [
                math_inst.element_a,
                math_inst.element_b,
                math_inst.element_accumulator,
                math_inst.element_accumulator,
            ]
            for threadblock_shape_n in threadBlockShape_N:
                for align_a in ldgBits_A:
                    for align_b in ldgBits_B:
                        ldg_elements_a = align_a // DataTypeSize[math_inst.element_a]
                        ldg_elements_b = align_b // DataTypeSize[math_inst.element_b]
                        threadblock_shape_k = (256 * ldg_elements_a) // (threadblock_shape_n // ldg_elements_b)
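                        # e.g. (editorial) f32 with align_a = align_b = 128 bits gives
                        # ldg_elements_a = ldg_elements_b = 4; for threadblock_shape_n = 128,
                        # threadblock_shape_k = (256 * 4) // (128 // 4) = 32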
                        threadblock_shape = [1, threadblock_shape_n, threadblock_shape_k]
                        thread_shape = [1, ldg_elements_b, ldg_elements_a]

                        operations.append(GeneratesGemv(math_inst, \
                                                        threadblock_shape, \
                                                        thread_shape, \
                                                        data_type, \
                                                        layout[0], \
                                                        layout[1], \
                                                        layout[2], \
                                                        min_cc, \
                                                        align_a, \
                                                        align_b))
    return operations

#
def GenerateConv2dOperations(args):
    if args.type == "simt":
        return GenerateConv2d_Simt(args)
    elif args.type == "tensorop8816":
        return GenerateConv2d_TensorOp_8816(args)
    else:
assert args.type == "tensorop8832", "operation conv2d only support" \ | |||
"simt, tensorop8816 and tensorop8832. (got:{})".format(args.type) | |||
return GenerateConv2d_TensorOp_8832(args) | |||
def GenerateDeconvOperations(args): | |||
assert args.type == "simt", "operation deconv only support" \ | |||
"simt. (got:{})".format(args.type) | |||
return GenerateDeconv_Simt(args) | |||
def GenerateGemmOperations(args): | |||
assert args.type == "simt", "operation gemm only support" \ | |||
"simt. (got:{})".format(args.type) | |||
return GenerateGemm_Simt(args) | |||
def GenerateGemvOperations(args): | |||
assert args.type == "simt", "operation gemv only support" \ | |||
"simt. (got:{})".format(args.type) | |||
return GenerateGemv_Simt(args) | |||
###################################################################################################
###################################################################################################

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generates device kernel registration code for CUTLASS Kernels")
    parser.add_argument("--operations", type=str, choices=['gemm', 'gemv', 'conv2d', 'deconv'],
                        required=True, help="Specifies the operation to generate (gemm, gemv, conv2d, deconv)")
    parser.add_argument("output", type=str, help="output directory for CUTLASS kernel files")
    parser.add_argument("--type", type=str, choices=['simt', 'tensorop8816', 'tensorop8832'],
                        default='simt', help="kernel type of CUTLASS kernel generator")

    operation2wrapper_path = {
        "gemm": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuinl", \
        "gemv": "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl", \
        "conv2d": "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl", \
        "deconv": "src/cuda/convolution/backward_data/implicit_gemm_deconv_cutlass_wrapper.cuinl", \
    }

    args = parser.parse_args()
    wrapper_path = operation2wrapper_path[args.operations]

    if args.operations == "gemm":
        operations = GenerateGemmOperations(args)
    elif args.operations == "gemv":
        operations = GenerateGemvOperations(args)
    elif args.operations == "conv2d":
        operations = GenerateConv2dOperations(args)
    elif args.operations == "deconv":
        operations = GenerateDeconvOperations(args)

    if args.operations == "conv2d" or args.operations == "deconv":
        for operation in operations:
            with EmitConvSingleKernelWrapper(args.output, operation, wrapper_path) as emitter:
                emitter.emit()
    elif args.operations == "gemm" or args.operations == "gemv":
        for operation in operations:
            with EmitGemmSingleKernelWrapper(args.output, operation, wrapper_path) as emitter:
                emitter.emit()

#
###################################################################################################
@@ -0,0 +1,27 @@
#
# \file lazy_file.py
#
# \brief LazyFile updates the target file only when the content has changed,
# to avoid regenerating the cutlass kimpls every time cmake is called
#
import io
import os


class LazyFile:
    def __init__(self, filename):
        self.filename = filename
        self.buffer = io.StringIO()

    def write(self, data):
        self.buffer.write(str(data))

    def close(self):
        if os.path.isfile(self.filename):
            with open(self.filename) as f:
                old_data = f.read()
        else:
            old_data = ""
        new_data = self.buffer.getvalue()
        if old_data != new_data:
            with open(self.filename, "w") as f:
                f.write(new_data)
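# Example usage (editorial sketch): the file on disk is only touched when the
# contents differ, so cmake does not see a new timestamp on unchanged kimpls.
#   f = LazyFile("kimpl/foo.cu")
#   f.write("// generated\n")
#   f.close()  # diffs against the existing kimpl/foo.cu before writing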
@@ -0,0 +1,614 @@
#
# \file library.py
#
# \brief Common data types, names and tags used by the CUTLASS instance generators
#
import re
###################################################################################################

import enum

# The following block implements enum.auto() for Python 3.5 variants that don't include it such
# as the default 3.5.2 on Ubuntu 16.04.
#
# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility
try:
    from enum import auto as enum_auto
except ImportError:
    __cutlass_library_auto_enum = 0
    def enum_auto() -> int:
        global __cutlass_library_auto_enum
        i = __cutlass_library_auto_enum
        __cutlass_library_auto_enum += 1
        return i

###################################################################################################

#
class GeneratorTarget(enum.Enum):
    Library = enum_auto()

#
GeneratorTargetNames = {
    GeneratorTarget.Library: 'library'
}

#
###################################################################################################

#
class DataType(enum.Enum):
    b1 = enum_auto()
    u4 = enum_auto()
    u8 = enum_auto()
    u16 = enum_auto()
    u32 = enum_auto()
    u64 = enum_auto()
    s4 = enum_auto()
    s8 = enum_auto()
    s16 = enum_auto()
    s32 = enum_auto()
    s64 = enum_auto()
    f16 = enum_auto()
    bf16 = enum_auto()
    f32 = enum_auto()
    tf32 = enum_auto()
    f64 = enum_auto()
    cf16 = enum_auto()
    cbf16 = enum_auto()
    cf32 = enum_auto()
    ctf32 = enum_auto()
    cf64 = enum_auto()
    cs4 = enum_auto()
    cs8 = enum_auto()
    cs16 = enum_auto()
    cs32 = enum_auto()
    cs64 = enum_auto()
    cu4 = enum_auto()
    cu8 = enum_auto()
    cu16 = enum_auto()
    cu32 = enum_auto()
    cu64 = enum_auto()
    invalid = enum_auto()

#
ShortDataTypeNames = {
    DataType.s32: 'i',
    DataType.f16: 'h',
    DataType.f32: 's',
    DataType.f64: 'd',
    DataType.cf32: 'c',
    DataType.cf64: 'z',
}

#
DataTypeNames = {
    DataType.b1: "b1",
    DataType.u4: "u4",
    DataType.u8: "u8",
    DataType.u16: "u16",
    DataType.u32: "u32",
    DataType.u64: "u64",
    DataType.s4: "s4",
    DataType.s8: "s8",
    DataType.s16: "s16",
    DataType.s32: "s32",
    DataType.s64: "s64",
    DataType.f16: "f16",
    DataType.bf16: "bf16",
    DataType.f32: "f32",
    DataType.tf32: "tf32",
    DataType.f64: "f64",
    DataType.cf16: "cf16",
    DataType.cbf16: "cbf16",
    DataType.cf32: "cf32",
    DataType.ctf32: "ctf32",
    DataType.cf64: "cf64",
    DataType.cu4: "cu4",
    DataType.cu8: "cu8",
    DataType.cu16: "cu16",
    DataType.cu32: "cu32",
    DataType.cu64: "cu64",
    DataType.cs4: "cs4",
    DataType.cs8: "cs8",
    DataType.cs16: "cs16",
    DataType.cs32: "cs32",
    DataType.cs64: "cs64",
}

DataTypeTag = {
    DataType.b1: "cutlass::uint1b_t",
    DataType.u4: "cutlass::uint4b_t",
    DataType.u8: "uint8_t",
    DataType.u16: "uint16_t",
    DataType.u32: "uint32_t",
    DataType.u64: "uint64_t",
    DataType.s4: "cutlass::int4b_t",
    DataType.s8: "int8_t",
    DataType.s16: "int16_t",
    DataType.s32: "int32_t",
    DataType.s64: "int64_t",
    DataType.f16: "cutlass::half_t",
    DataType.bf16: "cutlass::bfloat16_t",
    DataType.f32: "float",
    DataType.tf32: "cutlass::tfloat32_t",
    DataType.f64: "double",
    DataType.cf16: "cutlass::complex<cutlass::half_t>",
    DataType.cbf16: "cutlass::complex<cutlass::bfloat16_t>",
    DataType.cf32: "cutlass::complex<float>",
    DataType.ctf32: "cutlass::complex<cutlass::tfloat32_t>",
    DataType.cf64: "cutlass::complex<double>",
    DataType.cu4: "cutlass::complex<cutlass::uint4b_t>",
    DataType.cu8: "cutlass::complex<cutlass::uint8_t>",
    DataType.cu16: "cutlass::complex<cutlass::uint16_t>",
    DataType.cu32: "cutlass::complex<cutlass::uint32_t>",
    DataType.cu64: "cutlass::complex<cutlass::uint64_t>",
    DataType.cs4: "cutlass::complex<cutlass::int4b_t>",
    DataType.cs8: "cutlass::complex<cutlass::int8_t>",
    DataType.cs16: "cutlass::complex<cutlass::int16_t>",
    DataType.cs32: "cutlass::complex<cutlass::int32_t>",
    DataType.cs64: "cutlass::complex<cutlass::int64_t>",
}
DataTypeSize = { | |||
DataType.b1: 1, | |||
DataType.u4: 4, | |||
  DataType.u8: 8,
DataType.u16: 16, | |||
DataType.u32: 32, | |||
DataType.u64: 64, | |||
DataType.s4: 4, | |||
DataType.s8: 8, | |||
DataType.s16: 16, | |||
DataType.s32: 32, | |||
DataType.s64: 64, | |||
DataType.f16: 16, | |||
DataType.bf16: 16, | |||
DataType.f32: 32, | |||
DataType.tf32: 32, | |||
DataType.f64: 64, | |||
DataType.cf16: 32, | |||
DataType.cbf16: 32, | |||
DataType.cf32: 64, | |||
  DataType.ctf32: 64,
DataType.cf64: 128, | |||
DataType.cu4: 8, | |||
DataType.cu8: 16, | |||
DataType.cu16: 32, | |||
DataType.cu32: 64, | |||
DataType.cu64: 128, | |||
DataType.cs4: 8, | |||
DataType.cs8: 16, | |||
DataType.cs16: 32, | |||
DataType.cs32: 64, | |||
DataType.cs64: 128, | |||
} | |||
################################################################################################### | |||
# | |||
class ComplexTransform(enum.Enum): | |||
none = enum_auto() | |||
conj = enum_auto() | |||
# | |||
ComplexTransformTag = { | |||
ComplexTransform.none: 'cutlass::ComplexTransform::kNone', | |||
ComplexTransform.conj: 'cutlass::ComplexTransform::kConjugate', | |||
} | |||
# | |||
RealComplexBijection = [ | |||
(DataType.f16, DataType.cf16), | |||
(DataType.f32, DataType.cf32), | |||
(DataType.f64, DataType.cf64), | |||
] | |||
# | |||
def is_complex(data_type): | |||
for r, c in RealComplexBijection: | |||
if data_type == c: | |||
return True | |||
return False | |||
# | |||
def get_complex_from_real(real_type): | |||
for r, c in RealComplexBijection: | |||
if real_type == r: | |||
return c | |||
return DataType.invalid | |||
# | |||
def get_real_from_complex(complex_type): | |||
for r, c in RealComplexBijection: | |||
if complex_type == c: | |||
return r | |||
return DataType.invalid | |||
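# For example, is_complex(DataType.cf32) is True while is_complex(DataType.f32)
# is False, and get_complex_from_real(DataType.f16) yields DataType.cf16;
# types outside the bijection map to False or DataType.invalid.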
# | |||
class ComplexMultiplyOp(enum.Enum): | |||
multiply_add = enum_auto() | |||
gaussian = enum_auto() | |||
################################################################################################### | |||
# | |||
class MathOperation(enum.Enum): | |||
multiply_add = enum_auto() | |||
multiply_add_saturate = enum_auto() | |||
xor_popc = enum_auto() | |||
multiply_add_fast_bf16 = enum_auto() | |||
multiply_add_fast_f16 = enum_auto() | |||
multiply_add_complex = enum_auto() | |||
multiply_add_complex_gaussian = enum_auto() | |||
# | |||
MathOperationTag = { | |||
MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd', | |||
MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate', | |||
MathOperation.xor_popc: 'cutlass::arch::OpXorPopc', | |||
MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16', | |||
MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16', | |||
MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex', | |||
MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex', | |||
} | |||
################################################################################################### | |||
# | |||
class LayoutType(enum.Enum): | |||
ColumnMajor = enum_auto() | |||
RowMajor = enum_auto() | |||
ColumnMajorInterleaved2 = enum_auto() | |||
RowMajorInterleaved2 = enum_auto() | |||
ColumnMajorInterleaved32 = enum_auto() | |||
RowMajorInterleaved32 = enum_auto() | |||
ColumnMajorInterleaved64 = enum_auto() | |||
RowMajorInterleaved64 = enum_auto() | |||
TensorNHWC = enum_auto() | |||
TensorNDHWC = enum_auto() | |||
TensorNCHW = enum_auto() | |||
TensorNGHWC = enum_auto() | |||
TensorNC4HW4 = enum_auto() | |||
TensorC4RSK4 = enum_auto() | |||
TensorNC8HW8 = enum_auto() | |||
TensorNC16HW16 = enum_auto() | |||
TensorNC32HW32 = enum_auto() | |||
TensorNC64HW64 = enum_auto() | |||
TensorC32RSK32 = enum_auto() | |||
TensorC64RSK64 = enum_auto() | |||
TensorK4RSC4 = enum_auto() | |||
# | |||
LayoutTag = { | |||
LayoutType.ColumnMajor: 'cutlass::layout::ColumnMajor', | |||
LayoutType.RowMajor: 'cutlass::layout::RowMajor', | |||
LayoutType.ColumnMajorInterleaved2: 'cutlass::layout::ColumnMajorInterleaved<2>', | |||
LayoutType.RowMajorInterleaved2: 'cutlass::layout::RowMajorInterleaved<2>', | |||
LayoutType.ColumnMajorInterleaved32: 'cutlass::layout::ColumnMajorInterleaved<32>', | |||
LayoutType.RowMajorInterleaved32: 'cutlass::layout::RowMajorInterleaved<32>', | |||
LayoutType.ColumnMajorInterleaved64: 'cutlass::layout::ColumnMajorInterleaved<64>', | |||
LayoutType.RowMajorInterleaved64: 'cutlass::layout::RowMajorInterleaved<64>', | |||
LayoutType.TensorNHWC: 'cutlass::layout::TensorNHWC', | |||
LayoutType.TensorNDHWC: 'cutlass::layout::TensorNDHWC', | |||
LayoutType.TensorNCHW: 'cutlass::layout::TensorNCHW', | |||
LayoutType.TensorNGHWC: 'cutlass::layout::TensorNGHWC', | |||
LayoutType.TensorNC4HW4: 'cutlass::layout::TensorNCxHWx<4>', | |||
LayoutType.TensorC4RSK4: 'cutlass::layout::TensorCxRSKx<4>', | |||
LayoutType.TensorNC8HW8: 'cutlass::layout::TensorNCxHWx<8>', | |||
LayoutType.TensorNC16HW16: 'cutlass::layout::TensorNCxHWx<16>', | |||
LayoutType.TensorNC32HW32: 'cutlass::layout::TensorNCxHWx<32>', | |||
LayoutType.TensorC32RSK32: 'cutlass::layout::TensorCxRSKx<32>', | |||
LayoutType.TensorNC64HW64: 'cutlass::layout::TensorNCxHWx<64>', | |||
LayoutType.TensorC64RSK64: 'cutlass::layout::TensorCxRSKx<64>', | |||
LayoutType.TensorK4RSC4: 'cutlass::layout::TensorKxRSCx<4>', | |||
} | |||
# | |||
TransposedLayout = { | |||
LayoutType.ColumnMajor: LayoutType.RowMajor, | |||
LayoutType.RowMajor: LayoutType.ColumnMajor, | |||
LayoutType.ColumnMajorInterleaved2: LayoutType.RowMajorInterleaved2, | |||
LayoutType.RowMajorInterleaved2: LayoutType.ColumnMajorInterleaved2, | |||
LayoutType.ColumnMajorInterleaved32: LayoutType.RowMajorInterleaved32, | |||
LayoutType.RowMajorInterleaved32: LayoutType.ColumnMajorInterleaved32, | |||
LayoutType.ColumnMajorInterleaved64: LayoutType.RowMajorInterleaved64, | |||
LayoutType.RowMajorInterleaved64: LayoutType.ColumnMajorInterleaved64, | |||
LayoutType.TensorNHWC: LayoutType.TensorNHWC | |||
} | |||
# | |||
ShortLayoutTypeNames = { | |||
LayoutType.ColumnMajor: 'n', | |||
  LayoutType.ColumnMajorInterleaved2: 'n2',
LayoutType.ColumnMajorInterleaved32: 'n32', | |||
LayoutType.ColumnMajorInterleaved64: 'n64', | |||
LayoutType.RowMajor: 't', | |||
LayoutType.RowMajorInterleaved2: 't2', | |||
LayoutType.RowMajorInterleaved32: 't32', | |||
LayoutType.RowMajorInterleaved64: 't64', | |||
LayoutType.TensorNHWC: 'nhwc', | |||
LayoutType.TensorNDHWC: 'ndhwc', | |||
LayoutType.TensorNCHW: 'nchw', | |||
LayoutType.TensorNGHWC: 'nghwc', | |||
LayoutType.TensorNC4HW4: 'nc4hw4', | |||
LayoutType.TensorC4RSK4: 'c4rsk4', | |||
LayoutType.TensorNC8HW8: 'nc8hw8', | |||
LayoutType.TensorNC16HW16: 'nc16hw16', | |||
LayoutType.TensorNC32HW32: 'nc32hw32', | |||
LayoutType.TensorNC64HW64: 'nc64hw64', | |||
LayoutType.TensorC32RSK32: 'c32rsk32', | |||
LayoutType.TensorC64RSK64: 'c64rsk64', | |||
LayoutType.TensorK4RSC4: 'k4rsc4', | |||
} | |||
# | |||
ShortComplexLayoutNames = { | |||
(LayoutType.ColumnMajor, ComplexTransform.none): 'n', | |||
(LayoutType.ColumnMajor, ComplexTransform.conj): 'c', | |||
(LayoutType.RowMajor, ComplexTransform.none): 't', | |||
(LayoutType.RowMajor, ComplexTransform.conj): 'h' | |||
} | |||
################################################################################################### | |||
# | |||
class OpcodeClass(enum.Enum): | |||
Simt = enum_auto() | |||
TensorOp = enum_auto() | |||
WmmaTensorOp = enum_auto() | |||
OpcodeClassNames = { | |||
OpcodeClass.Simt: 'simt', | |||
OpcodeClass.TensorOp: 'tensorop', | |||
OpcodeClass.WmmaTensorOp: 'wmma_tensorop', | |||
} | |||
OpcodeClassTag = { | |||
OpcodeClass.Simt: 'cutlass::arch::OpClassSimt', | |||
OpcodeClass.TensorOp: 'cutlass::arch::OpClassTensorOp', | |||
OpcodeClass.WmmaTensorOp: 'cutlass::arch::OpClassWmmaTensorOp', | |||
} | |||
################################################################################################### | |||
# | |||
class OperationKind(enum.Enum): | |||
Gemm = enum_auto() | |||
Conv2d = enum_auto() | |||
# | |||
OperationKindNames = {
  OperationKind.Gemm: 'gemm',
  OperationKind.Conv2d: 'conv2d',
}
# | |||
class Target(enum.Enum): | |||
library = enum_auto() | |||
ArchitectureNames = { | |||
50: 'maxwell', | |||
60: 'pascal', | |||
61: 'pascal', | |||
70: 'volta', | |||
75: 'turing', | |||
80: 'ampere', | |||
} | |||
################################################################################################### | |||
# | |||
def SubstituteTemplate(template, values): | |||
text = template | |||
changed = True | |||
while changed: | |||
changed = False | |||
for key, value in values.items(): | |||
regex = "\\$\\{%s\\}" % key | |||
newtext = re.sub(regex, value, text) | |||
if newtext != text: | |||
changed = True | |||
text = newtext | |||
return text | |||
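# A minimal sketch of the fixed-point expansion (the names here are
# illustrative, not taken from the generator):
#
#   SubstituteTemplate("cutlass_${opcode}_${op}",
#                      {"opcode": "simt", "op": "sgemm_${layout}",
#                       "layout": "nn"})
#   # -> "cutlass_simt_sgemm_nn"
#
# The while-changed loop keeps substituting until no ${key} remains, so a
# value may itself contain placeholders that are resolved on a later pass.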
################################################################################################### | |||
# | |||
class GemmKind(enum.Enum): | |||
Gemm = enum_auto() | |||
Sparse = enum_auto() | |||
Universal = enum_auto() | |||
PlanarComplex = enum_auto() | |||
PlanarComplexArray = enum_auto() | |||
SplitKParallel = enum_auto() | |||
GemvBatchedStrided = enum_auto() | |||
# | |||
GemmKindNames = { | |||
GemmKind.Gemm: "gemm", | |||
GemmKind.Sparse: "spgemm", | |||
GemmKind.Universal: "gemm", | |||
GemmKind.PlanarComplex: "gemm_planar_complex", | |||
GemmKind.PlanarComplexArray: "gemm_planar_complex_array", | |||
GemmKind.SplitKParallel: "gemm_split_k_parallel", | |||
GemmKind.GemvBatchedStrided: "gemv_batched_strided", | |||
} | |||
# | |||
class EpilogueFunctor(enum.Enum): | |||
LinearCombination = enum_auto() | |||
LinearCombinationClamp = enum_auto() | |||
BiasAddLinearCombination = enum_auto() | |||
BiasAddLinearCombinationRelu = enum_auto() | |||
BiasAddLinearCombinationHSwish = enum_auto() | |||
BiasAddLinearCombinationClamp = enum_auto() | |||
BiasAddLinearCombinationReluClamp = enum_auto() | |||
BiasAddLinearCombinationHSwishClamp = enum_auto() | |||
# | |||
EpilogueFunctorTag = { | |||
EpilogueFunctor.LinearCombination: 'cutlass::epilogue::thread::LinearCombination', | |||
EpilogueFunctor.LinearCombinationClamp: 'cutlass::epilogue::thread::LinearCombinationClamp', | |||
EpilogueFunctor.BiasAddLinearCombination: 'cutlass::epilogue::thread::BiasAddLinearCombination', | |||
EpilogueFunctor.BiasAddLinearCombinationRelu: 'cutlass::epilogue::thread::BiasAddLinearCombinationRelu', | |||
EpilogueFunctor.BiasAddLinearCombinationHSwish: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwish', | |||
EpilogueFunctor.BiasAddLinearCombinationClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationClamp', | |||
EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp', | |||
EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp', | |||
} | |||
# | |||
ShortEpilogueNames = { | |||
EpilogueFunctor.BiasAddLinearCombinationHSwishClamp: 'hswish', | |||
EpilogueFunctor.BiasAddLinearCombinationReluClamp: 'relu', | |||
EpilogueFunctor.BiasAddLinearCombinationClamp: 'identity', | |||
EpilogueFunctor.BiasAddLinearCombinationHSwish: 'hswish', | |||
EpilogueFunctor.BiasAddLinearCombinationRelu: 'relu', | |||
EpilogueFunctor.BiasAddLinearCombination: 'identity', | |||
} | |||
# | |||
class SwizzlingFunctor(enum.Enum): | |||
Identity1 = enum_auto() | |||
Identity2 = enum_auto() | |||
Identity4 = enum_auto() | |||
Identity8 = enum_auto() | |||
ConvFpropNCxHWx = enum_auto() | |||
ConvFpropNHWC = enum_auto() | |||
ConvDgradNCxHWx = enum_auto() | |||
# | |||
SwizzlingFunctorTag = { | |||
SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>', | |||
SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>', | |||
SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>', | |||
SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>', | |||
SwizzlingFunctor.ConvFpropNCxHWx: 'cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle', | |||
SwizzlingFunctor.ConvFpropNHWC: 'cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle', | |||
SwizzlingFunctor.ConvDgradNCxHWx: 'cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle', | |||
} | |||
################################################################################################### | |||
class ConvType(enum.Enum): | |||
Convolution = enum_auto() | |||
BatchConvolution = enum_auto() | |||
Local = enum_auto() | |||
LocalShare = enum_auto() | |||
ConvTypeTag = { | |||
ConvType.Convolution: 'cutlass::conv::ConvType::kConvolution', | |||
ConvType.BatchConvolution: 'cutlass::conv::ConvType::kBatchConvolution', | |||
ConvType.Local: 'cutlass::conv::ConvType::kLocal', | |||
ConvType.LocalShare : 'cutlass::conv::ConvType::kLocalShare', | |||
} | |||
# | |||
class ConvKind(enum.Enum): | |||
Fprop = enum_auto() | |||
Dgrad = enum_auto() | |||
Wgrad = enum_auto() | |||
# | |||
ConvKindTag = { | |||
ConvKind.Fprop: 'cutlass::conv::Operator::kFprop', | |||
ConvKind.Dgrad: 'cutlass::conv::Operator::kDgrad', | |||
ConvKind.Wgrad: 'cutlass::conv::Operator::kWgrad' | |||
} | |||
ConvKindNames = { | |||
ConvKind.Fprop: 'fprop', | |||
ConvKind.Dgrad: 'dgrad', | |||
ConvKind.Wgrad: 'wgrad', | |||
} | |||
# | |||
class IteratorAlgorithm(enum.Enum): | |||
Analytic = enum_auto() | |||
Optimized = enum_auto() | |||
# | |||
IteratorAlgorithmTag = { | |||
IteratorAlgorithm.Analytic: 'cutlass::conv::IteratorAlgorithm::kAnalytic', | |||
IteratorAlgorithm.Optimized: 'cutlass::conv::IteratorAlgorithm::kOptimized', | |||
} | |||
IteratorAlgorithmNames = { | |||
IteratorAlgorithm.Analytic: 'analytic', | |||
IteratorAlgorithm.Optimized: 'optimized', | |||
} | |||
# | |||
class StrideSupport(enum.Enum): | |||
Strided = enum_auto() | |||
Unity = enum_auto() | |||
# | |||
StrideSupportTag = { | |||
StrideSupport.Strided: 'cutlass::conv::StrideSupport::kStrided', | |||
StrideSupport.Unity: 'cutlass::conv::StrideSupport::kUnity', | |||
} | |||
StrideSupportNames = { | |||
StrideSupport.Strided: '', | |||
StrideSupport.Unity: 'unity_stride', | |||
} | |||
class ImplicitGemmMode(enum.Enum): | |||
GemmNt = enum_auto() | |||
GemmTn = enum_auto() | |||
ImplicitGemmModeNames = { | |||
ImplicitGemmMode.GemmNt: 'gemm_nt', | |||
ImplicitGemmMode.GemmTn: 'gemm_tn', | |||
} | |||
ImplicitGemmModeTag = { | |||
ImplicitGemmMode.GemmNt: 'cutlass::conv::ImplicitGemmMode::GEMM_NT', | |||
ImplicitGemmMode.GemmTn: 'cutlass::conv::ImplicitGemmMode::GEMM_TN', | |||
} | |||
################################################################################################### | |||
# | |||
class MathInstruction: | |||
def __init__(self, instruction_shape, element_a, element_b, element_accumulator, opcode_class, math_operation = MathOperation.multiply_add): | |||
self.instruction_shape = instruction_shape | |||
self.element_a = element_a | |||
self.element_b = element_b | |||
self.element_accumulator = element_accumulator | |||
self.opcode_class = opcode_class | |||
self.math_operation = math_operation | |||
# | |||
class TileDescription: | |||
def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute): | |||
self.threadblock_shape = threadblock_shape | |||
self.stages = stages | |||
self.warp_count = warp_count | |||
self.math_instruction = math_instruction | |||
self.minimum_compute_capability = min_compute | |||
self.maximum_compute_capability = max_compute | |||
def procedural_name(self): | |||
return "%dx%d_%dx%d" % (self.threadblock_shape[0], self.threadblock_shape[1], self.threadblock_shape[2], self.stages) | |||
# | |||
class TensorDescription: | |||
def __init__(self, element, layout, alignment = 1, complex_transform = ComplexTransform.none): | |||
self.element = element | |||
self.layout = layout | |||
self.alignment = alignment | |||
self.complex_transform = complex_transform | |||
################################################################################################### |
@@ -0,0 +1,578 @@ | |||
# Generated by dnn/scripts/cutlass_generator/gen_list.py | |||
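# The file names appear to follow the generator's procedural naming scheme,
# e.g. "cutlass_simt_sgemm_64x128_8x2_nn_align1.cu" reads as: simt opcode
# class, f32 gemm ('s'), a 64x128 threadblock tile with k = 8 and 2 stages,
# column-major x column-major operands ('nn'), and alignment 1.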
cutlass_gen_list = [ | |||
"cutlass_simt_sgemm_8x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_16x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_16x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_32x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_32x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_64x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_16x128_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_32x128_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_64x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_128x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_64x128_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_128x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_32x256_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_64x256_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_128x128_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_256x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_256x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu", | |||
"cutlass_simt_sgemm_8x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_16x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_16x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_32x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_32x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_64x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_16x128_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_32x128_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_64x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_128x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_64x128_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_128x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_32x256_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_64x256_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_128x128_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_256x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_256x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu", | |||
"cutlass_simt_sgemm_8x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_16x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_16x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_32x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_32x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_64x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_16x128_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_32x128_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_64x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_128x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_64x128_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_128x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_32x256_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_64x256_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_128x128_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_256x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_256x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu", | |||
"cutlass_simt_sgemm_8x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_16x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_16x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_32x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_32x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_64x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_16x128_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_32x128_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_64x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_128x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_64x128_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_128x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_32x256_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_64x256_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_128x128_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_256x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_256x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu", | |||
"cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu", | |||
"cutlass_simt_s8_idgrad_identity_s8_64x128x32_64x32x32_2_nc4hw4_k4rsc4.cu", | |||
"cutlass_simt_s8_idgrad_identity_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu", | |||
"cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu", | |||
"cutlass_simt_s8_idgrad_identity_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu", | |||
"cutlass_simt_s8_idgrad_identity_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_s8_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nc32hw32.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_32x16x64_2_nc32hw32_c32rsk32.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x64x64_32x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_identity_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x64x64_16x32x64_2_nc32hw32_c32rsk32_nc4hw4.cu", | |||
"cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu", | |||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8.cu", | |||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16.cu", | |||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
"cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32.cu", | |||
] |
@@ -0,0 +1,351 @@ | |||
# | |||
# \file generator.py | |||
# | |||
# \brief Generates the CUTLASS Library's instances | |||
# | |||
import enum
import os.path
import re
import shutil
from library import *
from gemm_operation import *
from conv2d_operation import *
################################################################################################### | |||
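# Emits all_<kind>_operations.cu for a single OperationKind: forward
# declarations plus an initialize_* call for every configuration passed to
# emit(). Driven as a context manager from Manifest.emit below, roughly:
#
#   with EmitOperationKindLibrary(generated_path, kind, args) as emitter:
#     emitter.emit(configuration_name, operations)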
class EmitOperationKindLibrary:
  def __init__(self, generated_path, kind, args):
    self.generated_path = generated_path
    self.kind = kind
    self.args = args
    self.emitters = {
      OperationKind.Gemm: EmitGemmConfigurationLibrary,
      OperationKind.Conv2d: EmitConv2dConfigurationLibrary
    }
    self.configurations = []
    self.header_template = """
/*
 Generated by manifest.py - Do not edit.
*/
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
namespace cutlass {
namespace library {
///////////////////////////////////////////////////////////////////////////////////////////////////
"""
    self.entry_template = """
//
// Entry point to construct operations
//
void initialize_all_${operation_name}_operations(Manifest &manifest) {
"""
    self.configuration_prototype_template = "void initialize_${configuration_name}(Manifest &manifest);\n"
    self.configuration_template = " initialize_${configuration_name}(manifest);\n"
    self.epilogue_template = """
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library
} // namespace cutlass
"""

  #
  def __enter__(self):
    self.operation_path = os.path.join(self.generated_path, OperationKindNames[self.kind])
    os.mkdir(self.operation_path)
    self.top_level_path = os.path.join(self.operation_path, "all_%s_operations.cu" % OperationKindNames[self.kind])
    self.top_level_file = open(self.top_level_path, "w")
    self.top_level_file.write(self.header_template)
    self.source_files = [self.top_level_path,]
    return self

  #
  def emit(self, configuration_name, operations):
    with self.emitters[self.kind](self.operation_path, configuration_name) as configuration_emitter:
      for operation in operations:
        configuration_emitter.emit(operation)
    self.source_files.append(configuration_emitter.configuration_path)
    self.configurations.append(configuration_name)
    self.top_level_file.write(SubstituteTemplate(self.configuration_prototype_template, {'configuration_name': configuration_name}))

  #
  def __exit__(self, exception_type, exception_value, traceback):
    self.top_level_file.write(SubstituteTemplate(self.entry_template, {'operation_name': OperationKindNames[self.kind]}))
    for configuration_name in self.configurations:
      self.top_level_file.write(SubstituteTemplate(self.configuration_template, {'configuration_name': configuration_name}))
    self.top_level_file.write(self.epilogue_template)
    self.top_level_file.close()
################################################################################################### | |||
################################################################################################### | |||
class Options:
  def __init__(self):
    pass
################################################################################################### | |||
# | |||
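# Accumulates every generated operation, filters each one against the
# requested architectures/kernels, and emits the registration sources
# together with a manifest.cmake listing them.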
class Manifest:
  #
  def __init__(self, args):
    self.operations = {}
    self.args = args
    architectures = args.architectures.split(';') if len(args.architectures) else ['50',]
    self.compute_capabilities = [int(x) for x in architectures]
    self.selected_kernels = []
    if args.operations == 'all':
      self.operations_enabled = []
    else:
      operations_list = [
        OperationKind.Gemm,
        OperationKind.Conv2d
      ]
      self.operations_enabled = [x for x in operations_list if OperationKindNames[x] in args.operations.split(',')]
    if args.kernels == 'all':
      self.kernel_names = []
    else:
      self.kernel_names = [x for x in args.kernels.split(',') if x != '']
    self.ignore_kernel_names = [x for x in args.ignore_kernels.split(',') if x != '']
    if args.kernel_filter_file is None:
      self.kernel_filter_list = []
    else:
      self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file)
    self.operation_count = 0
    self.operations_by_name = {}
    self.top_level_prologue = '''
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
namespace cutlass {
namespace library {
${prototypes}
void initialize_all(Manifest &manifest) {
'''
    self.top_level_reserve = ' manifest.reserve(${operation_count});\n\n'
    self.top_level_epilogue = '''
}
} // namespace library
} // namespace cutlass
'''
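  # Each non-empty, non-comment line of the filter file is compiled as a
  # regular expression and later matched against kernel names
  # (see filter_out_kernels).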
  def get_kernel_filters(self, kernelListFile):
    if os.path.isfile(kernelListFile):
      with open(kernelListFile, 'r') as fileReader:
        lines = [line.rstrip() for line in fileReader if not line.startswith("#")]
      lines = [re.compile(line) for line in lines if line]
      return lines
    else:
      return []

  def filter_out_kernels(self, kernel_name, kernel_filter_list):
    for kernel_filter_re in kernel_filter_list:
      if kernel_filter_re.search(kernel_name) is not None:
        return True
    return False
  #
  def _filter_string_matches(self, filter_string, haystack):
    ''' Returns true if all substrings appear in the haystack in order '''
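    # e.g. filter string 'i8832*hswish*nhwc' accepts
    # 'cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8',
    # because the pieces split on '*' occur left to right in the name.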
    substrings = filter_string.split('*')
    for sub in substrings:
      idx = haystack.find(sub)
      if idx < 0:
        return False
      haystack = haystack[idx + len(sub):]
    return True
  #
  def filter(self, operation):
    ''' Filtering operations based on various criteria '''
    # filter based on compute capability
    enabled = False
    for cc in self.compute_capabilities:
      if cc >= operation.tile_description.minimum_compute_capability and \
         cc <= operation.tile_description.maximum_compute_capability:
        enabled = True
        break
    if not enabled:
      return False
    if len(self.operations_enabled) and not operation.operation_kind in self.operations_enabled:
      return False
    # eliminate duplicates
    if operation.procedural_name() in self.operations_by_name.keys():
      return False
    # Filter based on list of valid substrings
    if len(self.kernel_names):
      name = operation.procedural_name()
      enabled = False
      # compare against the include list
      for name_substr in self.kernel_names:
        if self._filter_string_matches(name_substr, name):
          enabled = True
          break
      # compare against the exclude list
      for name_substr in self.ignore_kernel_names:
        if self._filter_string_matches(name_substr, name):
          enabled = False
          break
    if len(self.kernel_filter_list) > 0:
      enabled = False
      if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list):
        enabled = True
    # todo: filter based on compute data type
    return enabled
  #
  #
  def append(self, operation):
    '''
    Inserts the operation.
    operation_kind -> configuration_name -> []
    '''
    if self.filter(operation):
      self.selected_kernels.append(operation.procedural_name())
      self.operations_by_name[operation.procedural_name()] = operation
      # add the configuration
      configuration_name = operation.configuration_name()
      if operation.operation_kind not in self.operations.keys():
        self.operations[operation.operation_kind] = {}
      if configuration_name not in self.operations[operation.operation_kind].keys():
        self.operations[operation.operation_kind][configuration_name] = []
      self.operations[operation.operation_kind][configuration_name].append(operation)
      self.operation_count += 1
  #
  #
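  # Output layout, rooted at <curr_build_dir>/generated:
  #   initialize_all.cpp                        -- top-level entry point
  #   <operation_kind>/all_<kind>_operations.cu -- per-kind initializers, plus one .cu per configuration
  #   manifest.cmake                            -- source list consumed by the build system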
  def emit(self, target = GeneratorTarget.Library):
    operation_emitters = {
      GeneratorTarget.Library: EmitOperationKindLibrary
    }
    generated_path = os.path.join(self.args.curr_build_dir, 'generated')
    # create generated/
    if os.path.exists(generated_path):
      shutil.rmtree(generated_path)
    os.mkdir(generated_path)
    source_files = []
    top_level_path = os.path.join(generated_path, 'initialize_all.cpp')
    with open(top_level_path, 'w') as top_level_file:
      if target == GeneratorTarget.Library:
        source_files.append(top_level_path)
        prototypes = []
        for operation_kind, configurations in self.operations.items():
          prototypes.append(SubstituteTemplate(
            "void initialize_all_${operation_kind}_operations(Manifest &manifest);",
            {'operation_kind': OperationKindNames[operation_kind]}))
        top_level_file.write(SubstituteTemplate(self.top_level_prologue,
          {'prototypes': "\n".join(prototypes)}))
        top_level_file.write(SubstituteTemplate(
          self.top_level_reserve, {'operation_count': str(self.operation_count)}))
        # for each operation kind, emit initializer for all configurations
        for operation_kind, configurations in self.operations.items():
          with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter:
            for configuration_name, operations in configurations.items():
              operation_kind_emitter.emit(configuration_name, operations)
            source_files += operation_kind_emitter.source_files
          top_level_file.write(SubstituteTemplate(
            " initialize_all_${operation_kind}_operations(manifest);\n",
            {'operation_kind': OperationKindNames[operation_kind]}))
        top_level_file.write(self.top_level_epilogue)
    # write the manifest.cmake file containing paths from all targets
    manifest_path = os.path.join(generated_path, "manifest.cmake")
    with open(manifest_path, "w") as manifest_file:
      target_name = 'cutlass_library_objs'
      target_text = SubstituteTemplate("""cutlass_target_sources(
${target_name}
BATCH_SOURCES ON
PRIVATE
""", {'target_name': target_name})
      manifest_file.write(target_text)
      for source_file in source_files:
        manifest_file.write(" %s\n" % str(source_file.replace('\\', '/')))
      manifest_file.write(")")
#
###################################################################################################
@@ -113,6 +113,31 @@ if(MGE_WITH_CUDA) | |||
list(APPEND SOURCES ${SOURCES_}) | |||
file(GLOB_RECURSE CUSOURCES cuda/*.cu) | |||
set(CUTLASS_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/../scripts/cutlass_generator/generator.py) | |||
set(CUTLASS_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/cuda/cutlass/generated) | |||
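# Runs generator.py at configure time for one (operation, type) pair; the
# generated instances land in ${CUTLASS_GEN_DIR}/${op}_${type} and the
# generator's output is captured in gen_cutlass.log.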
function(gen_cutlass_kimpl op type) | |||
set(CURRENT_CUTLASS_GEN_DIR ${CUTLASS_GEN_DIR}/${op}_${type}) | |||
file(MAKE_DIRECTORY ${CURRENT_CUTLASS_GEN_DIR}) | |||
execute_process( | |||
COMMAND ${PYTHON3_EXECUTABLE_WITHOUT_VERSION} ${CUTLASS_GEN_SCRIPT} --operations ${op} --type ${type} ${CURRENT_CUTLASS_GEN_DIR} | |||
RESULT_VARIABLE gen_cutlass_result | |||
OUTPUT_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log | |||
ERROR_FILE ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log | |||
) | |||
if (NOT gen_cutlass_result EQUAL 0) | |||
message(FATAL_ERROR "Error generating library instances. See ${CURRENT_CUTLASS_GEN_DIR}/gen_cutlass.log") | |||
endif() | |||
endfunction() | |||
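# Instantiate every operation/type combination registered by the library.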
gen_cutlass_kimpl(gemm simt) | |||
gen_cutlass_kimpl(gemv simt) | |||
gen_cutlass_kimpl(deconv simt) | |||
gen_cutlass_kimpl(conv2d simt) | |||
gen_cutlass_kimpl(conv2d tensorop8816) | |||
gen_cutlass_kimpl(conv2d tensorop8832) | |||
file(GLOB_RECURSE CUTLASS_SOURCES ${CUTLASS_GEN_DIR}/*.cu) | |||
list(APPEND SOURCES ${CUTLASS_SOURCES}) | |||
list(APPEND SOURCES ${CUSOURCES}) | |||
endif() | |||
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warning of cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
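// Note on the two recurring data paths: in the instances shown here, NHWC
// activations paired with TensorNCxHWx<8|16|32> filters always use
// ConvolutionFpropNHWCThreadblockSwizzle with ImplicitGemmMode::GEMM_TN,
// while TensorNCxHWx<64> activations paired with TensorCxRSKx<64> filters
// (as in the file above) always use ConvolutionFpropNCxHWxThreadblockSwizzle
// with ImplicitGemmMode::GEMM_NT.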
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<256, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
cutlass::int4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
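// Note on the epilogues: three variants recur across these files --
// BiasAddLinearCombinationClamp (identity, as above), ...ReluClamp, and
// ...HSwishClamp. Their arguments appear to be <ElementOutput,
// ElementsPerAccess, ElementAccumulator, ElementBias, ElementCompute>, and
// the functor presumably computes something like
//     dst = activation(alpha * accumulator + beta * bias + gamma * z)
// saturated to the output type's range; the coefficient names are
// illustrative, and the exact formula lives in the functor's implementation,
// not in this diff.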
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<256, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::int4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<256, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::int4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// ignore warnings from cutlass | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by cutlass generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
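Across the three 1x1 identity blocks above, only the filter interleave changes (TensorNCxHWx<16>, <32>, <8>), and the two integer template arguments after the stage count move in lockstep with it (16/16, 32/32, 8/8), by all appearances the per-access element granularity for src and filter. A standalone sketch of that invariant, with a struct and field names of our own invention:

```cpp
#include <cstdio>

// Mirrors the pattern in the three instantiations above: an NCxHWx<K>
// filter layout is paired with K-element src and filter accesses.
struct Int4FpropConfig {
    int interleave;     // K in cutlass::layout::TensorNCxHWx<K>
    int src_access;     // second-to-last integer template argument
    int filter_access;  // last integer template argument
};

constexpr Int4FpropConfig kConfigs[] = {{16, 16, 16}, {32, 32, 32}, {8, 8, 8}};

int main() {
    for (const auto& c : kConfigs)
        std::printf("NCxHWx<%d>: %d/%d-element src/filter accesses\n",
                    c.interleave, c.src_access, c.filter_access);
    return 0;
}
```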
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
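Many of the fprop configurations in this section recur in identity and relu flavors, and the only difference between paired blocks is the epilogue functor. The tag embedded in the instance name maps onto the functor as below (a plain lookup-table sketch, not generator code; the `hswish` tag shows up in the SIMT blocks further down):

```cpp
#include <cstdio>

// Tag in the generated instance name -> epilogue functor class
// (cutlass::epilogue::thread::...), as seen across this section.
struct EpilogueEntry {
    const char* tag;
    const char* functor;
};

constexpr EpilogueEntry kEpilogues[] = {
        {"identity", "BiasAddLinearCombinationClamp"},
        {"relu", "BiasAddLinearCombinationReluClamp"},
        {"hswish", "BiasAddLinearCombinationHSwish"},
};

int main() {
    for (const auto& e : kEpilogues)
        std::printf("%-8s -> %s\n", e.tag, e.functor);
    return 0;
}
```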
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
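The instance name encodes the full configuration: opcode class, MMA shape (`i8832` = 8x8x32 on integer data), conv kind, epilogue tag, element types, threadblock and warp tiles, stage count, and the src/filter layouts. Decoding the name above into a self-contained sketch (the struct is illustrative, not part of the generator):

```cpp
#include <cstdio>

// One generated instance name, split into the template arguments it
// stands for; field names are ours.
struct InstanceDesc {
    const char* opcode;      // "tensorop"  -> OpClassTensorOp
    const char* mma;         // "i8832"     -> GemmShape<8, 8, 32>
    const char* epilogue;    // "identity"  -> BiasAddLinearCombinationClamp
    int tb[3], warp[3];      // threadblock / warp GemmShape
    int stages;              // software pipeline depth
    const char* src_layout;  // "nc64hw64"  -> TensorNCxHWx<64>
    const char* flt_layout;  // "c64rsk64"  -> TensorCxRSKx<64>
};

int main() {
    // cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64
    const InstanceDesc d{"tensorop", "i8832", "identity",
                         {128, 128, 128}, {64, 64, 128}, 2,
                         "nc64hw64", "c64rsk64"};
    std::printf("tb=%dx%dx%d warp=%dx%dx%d stages=%d src=%s flt=%s\n",
                d.tb[0], d.tb[1], d.tb[2], d.warp[0], d.warp[1], d.warp[2],
                d.stages, d.src_layout, d.flt_layout);
    return 0;
}
```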
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<256, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
cutlass::uint4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc16hw16" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc32hw32" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_2_nhwc_nc8hw8" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 32, 64>, | |||
cutlass::gemm::GemmShape<64, 32, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc16hw16" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<16>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
16, | |||
16, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc32hw32" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<32>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_2_nhwc_nc8hw8" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorNCxHWx<8>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::layout::TensorNHWC, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<128, 64, 64>, | |||
cutlass::gemm::GemmShape<64, 64, 64>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
8, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNHWCThreadblockSwizzle, | |||
2, | |||
8, | |||
8, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_TN>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64_c64rsk64" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
cutlass::int4b_t, | |||
cutlass::layout::TensorCxRSKx<64>, | |||
cutlass::uint4b_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::layout::TensorNCxHWx<64>, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassTensorOp, | |||
cutlass::arch::Sm75, | |||
cutlass::gemm::GemmShape<256, 128, 128>, | |||
cutlass::gemm::GemmShape<64, 64, 128>, | |||
cutlass::gemm::GemmShape<8, 8, 32>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
cutlass::uint4b_t, | |||
16, | |||
int32_t, | |||
int32_t, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
32, | |||
32, | |||
true, | |||
cutlass::arch::OpMultiplyAddSaturate, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
int8_t, | |||
cutlass::layout::TensorNCxHWx<4>, | |||
int8_t, | |||
cutlass::layout::TensorCxRSKx<4>, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassSimt, | |||
cutlass::arch::Sm61, | |||
cutlass::gemm::GemmShape<128, 128, 32>, | |||
cutlass::gemm::GemmShape<64, 32, 32>, | |||
cutlass::gemm::GemmShape<1, 1, 4>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
float, | |||
1, | |||
int32_t, | |||
float, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
4, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAdd, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
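This block and those that follow switch from SM75 tensor-core int4 kernels to SM61 SIMT kernels: int8 NC4HW4 activations with C4RSK4 filters, float NCHW bias and output, and an HSwish epilogue, instantiated over a spread of threadblock shapes so a tile can be matched to the problem size. Below is a toy selection heuristic over the shapes that appear in this group; it is not megdnn's actual dispatch logic, just an illustration of why several tilings coexist:

```cpp
#include <cstdio>

struct Tile { int m, n, k; };

// Threadblock shapes taken from the SIMT hswish instantiations here.
constexpr Tile kTiles[] = {{128, 128, 32}, {128, 32, 32}, {128, 64, 32},
                           {16, 128, 16},  {16, 64, 8},   {32, 128, 32},
                           {32, 32, 32}};

// Toy heuristic: pick the tile wasting the least padded work for the
// implicit-GEMM extents (M = N*OH*OW, N = output channels); a real
// dispatcher also weighs occupancy, stages, and measured timings.
Tile pick_tile(long gemm_m, long gemm_n) {
    Tile best = kTiles[0];
    long best_waste = -1;
    for (const Tile& t : kTiles) {
        long padded = ((gemm_m + t.m - 1) / t.m * t.m) *
                      ((gemm_n + t.n - 1) / t.n * t.n);
        long waste = padded - gemm_m * gemm_n;
        if (best_waste < 0 || waste < best_waste) {
            best_waste = waste;
            best = t;
        }
    }
    return best;
}

int main() {
    Tile t = pick_tile(/*gemm_m=*/56 * 56, /*gemm_n=*/64);
    std::printf("chose %dx%dx%d\n", t.m, t.n, t.k);
    return 0;
}
```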
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
int8_t, | |||
cutlass::layout::TensorNCxHWx<4>, | |||
int8_t, | |||
cutlass::layout::TensorCxRSKx<4>, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassSimt, | |||
cutlass::arch::Sm61, | |||
cutlass::gemm::GemmShape<128, 32, 32>, | |||
cutlass::gemm::GemmShape<64, 32, 32>, | |||
cutlass::gemm::GemmShape<1, 1, 4>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
float, | |||
1, | |||
int32_t, | |||
float, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
4, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAdd, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
int8_t, | |||
cutlass::layout::TensorNCxHWx<4>, | |||
int8_t, | |||
cutlass::layout::TensorCxRSKx<4>, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassSimt, | |||
cutlass::arch::Sm61, | |||
cutlass::gemm::GemmShape<128, 64, 32>, | |||
cutlass::gemm::GemmShape<64, 32, 32>, | |||
cutlass::gemm::GemmShape<1, 1, 4>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
float, | |||
1, | |||
int32_t, | |||
float, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
4, | |||
16, | |||
false, | |||
cutlass::arch::OpMultiplyAdd, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
int8_t, | |||
cutlass::layout::TensorNCxHWx<4>, | |||
int8_t, | |||
cutlass::layout::TensorCxRSKx<4>, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassSimt, | |||
cutlass::arch::Sm61, | |||
cutlass::gemm::GemmShape<16, 128, 16>, | |||
cutlass::gemm::GemmShape<16, 128, 16>, | |||
cutlass::gemm::GemmShape<1, 1, 4>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
float, | |||
1, | |||
int32_t, | |||
float, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
1, | |||
4, | |||
8, | |||
false, | |||
cutlass::arch::OpMultiplyAdd, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
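The block above is the outlier of the group: a 16x128x16 tile with a single pipeline stage and a warp tile equal to the whole threadblock, trading latency hiding for a much smaller staging footprint on tiny GEMM extents. A back-of-the-envelope estimate of the int8 staging-buffer size, illustrative only and ignoring CUTLASS's exact shared-memory layout and padding:

```cpp
#include <cstdio>

// Rough per-threadblock staging bytes for int8 SIMT tiles: each stage
// holds a tb_m x tb_k activation slice plus a tb_k x tb_n filter slice.
long smem_bytes(int tb_m, int tb_n, int tb_k, int stages) {
    return long(stages) * (long(tb_m) * tb_k + long(tb_k) * tb_n);
}

int main() {
    std::printf("16x128x16, 1 stage  : %ld B\n", smem_bytes(16, 128, 16, 1));
    std::printf("128x128x32, 2 stages: %ld B\n", smem_bytes(128, 128, 32, 2));
    return 0;
}
```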
@@ -1,59 +0,0 @@ | |||
#if !MEGDNN_TEGRA_X1 | |||
// suppress warnings from the CUTLASS headers | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wunused-parameter" | |||
#pragma GCC diagnostic ignored "-Wstrict-aliasing" | |||
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" | |||
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by the CUTLASS generator | |||
using Convolution = | |||
typename cutlass::conv::device::Convolution< | |||
int8_t, | |||
cutlass::layout::TensorNCxHWx<4>, | |||
int8_t, | |||
cutlass::layout::TensorCxRSKx<4>, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
float, | |||
cutlass::layout::TensorNCHW, | |||
int32_t, | |||
cutlass::conv::ConvType::kConvolution, | |||
cutlass::arch::OpClassSimt, | |||
cutlass::arch::Sm61, | |||
cutlass::gemm::GemmShape<16, 64, 8>, | |||
cutlass::gemm::GemmShape<16, 64, 8>, | |||
cutlass::gemm::GemmShape<1, 1, 4>, | |||
cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< | |||
float, | |||
1, | |||
int32_t, | |||
float, | |||
float | |||
>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, | |||
4, | |||
4, | |||
false, | |||
cutlass::arch::OpMultiplyAdd, | |||
cutlass::conv::ImplicitGemmMode::GEMM_NT>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
const typename Convolution::ElementSrc* d_src, | |||
const typename Convolution::ElementFilter* d_filter, | |||
const typename Convolution::ElementBias* d_bias, | |||
const typename Convolution::ElementDst* d_z, | |||
typename Convolution::ElementDst* d_dst, | |||
int* workspace, | |||
typename Convolution::ConvolutionParameter const& conv_param, | |||
typename Convolution::EpilogueOutputOp::Params const& epilogue, | |||
cudaStream_t stream, | |||
typename Convolution::ExtraParam extra_param); | |||
#pragma GCC diagnostic pop | |||
#endif |
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 128, 32>,
        cutlass::gemm::GemmShape<32, 64, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 64, 32>,
        cutlass::gemm::GemmShape<32, 64, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<64, 128, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_hswish_s8_64x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<64, 64, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 128, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
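Editorial note: from this file on, the instances use the `BiasAddLinearCombination` epilogue instead of `BiasAddLinearCombinationHSwish`; everything else varies only in tile shapes and stage counts. A minimal scalar sketch of the presumed per-element epilogue semantics follows. The alpha/beta/gamma scaling of accumulator, bias, and residual input `z` is an assumption based on the functor names and the wrapper's `d_z` argument, not taken from the CUTLASS sources; the hswish formula itself is the standard `x * relu6(x + 3) / 6`.

```cpp
#include <algorithm>
#include <cstdint>

// Scalar sketch of the assumed per-element epilogue:
//   identity: dst = alpha * acc + beta * bias + gamma * z
//   hswish:   dst = hswish(alpha * acc + beta * bias + gamma * z)
float bias_add_linear_combination(float alpha, int32_t acc, float beta,
                                  float bias, float gamma, float z) {
    return alpha * static_cast<float>(acc) + beta * bias + gamma * z;
}

float hswish(float x) {
    // x * relu6(x + 3) / 6
    return x * std::min(std::max(x + 3.f, 0.f), 6.f) * (1.f / 6.f);
}
```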
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 32, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 64, 32>,
        cutlass::gemm::GemmShape<64, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<16, 128, 16>,
        cutlass::gemm::GemmShape<16, 128, 16>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        1,
        4,
        8,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<16, 64, 8>,
        cutlass::gemm::GemmShape<16, 64, 8>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        4,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 128, 32>,
        cutlass::gemm::GemmShape<32, 64, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
@@ -1,59 +0,0 @@
#if !MEGDNN_TEGRA_X1
// ignore warnings from cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"
// kernel instance "cutlass_simt_f32_ifprop_1x1_identity_s8_32x32x32_32x32x32_2_nc4hw4_c4rsk4_nchw" generated by cutlass generator
using Convolution =
    typename cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,
        float,
        cutlass::layout::TensorNCHW,
        float,
        cutlass::layout::TensorNCHW,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<32, 32, 32>,
        cutlass::gemm::GemmShape<1, 1, 4>,
        cutlass::epilogue::thread::BiasAddLinearCombination<
            float,
            1,
            int32_t,
            float,
            float
        >,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        false,
        cutlass::arch::OpMultiplyAdd,
        cutlass::conv::ImplicitGemmMode::GEMM_NT>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
    const typename Convolution::ElementSrc* d_src,
    const typename Convolution::ElementFilter* d_filter,
    const typename Convolution::ElementBias* d_bias,
    const typename Convolution::ElementDst* d_z,
    typename Convolution::ElementDst* d_dst,
    int* workspace,
    typename Convolution::ConvolutionParameter const& conv_param,
    typename Convolution::EpilogueOutputOp::Params const& epilogue,
    cudaStream_t stream,
    typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif
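Editorial note: each of these generated files ends with the same explicit instantiation of `megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper`, so every kernel variant lives in its own translation unit, presumably to let the many instances compile in parallel and keep per-object sizes bounded. A hypothetical call site follows, using only the types visible in the signatures above; `launch_example` and all variable names are illustrative placeholders, not names from the repository.

```cpp
// Hypothetical call shape (editorial sketch): reuses the `Convolution` alias
// from one of the generated files above; all pointers/params are placeholders
// that the caller is assumed to have prepared (device buffers, conv geometry,
// epilogue scaling params, CUDA stream).
void launch_example(const int8_t* d_src, const int8_t* d_filter,
                    const float* d_bias, const float* d_z, float* d_dst,
                    int* workspace,
                    const Convolution::ConvolutionParameter& conv_param,
                    const Convolution::EpilogueOutputOp::Params& epilogue,
                    cudaStream_t stream,
                    Convolution::ExtraParam extra_param) {
    megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
            d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param,
            epilogue, stream, extra_param);
}
```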