|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703 |
- #
- # \file generator.py
- #
- # \brief Generates the CUTLASS Library's instances
- #
-
- import enum
- import os.path
- import shutil
- import argparse
- import platform
-
- from library import *
- from manifest import *
-
- ###################################################################################################
-
- #
def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch=0):
    """Return True when the CUDA Toolkit version is at least major.minor.patch.

    Args:
        semantic_ver_string: Dotted version such as ``"10.2"`` or
            ``"11.0.132"``. An empty string selects the built-in default
            version (11.0.132).
        major: Required major version.
        minor: Required minor version.
        patch: Required patch version (defaults to 0).

    Returns:
        bool: result of the component-wise (lexicographic) list comparison.

    Note:
        Components missing from ``semantic_ver_string`` keep the default
        values, so ``"10.2"`` is evaluated as 10.2.132. Components beyond the
        third are appended and take part in the comparison.
    """
    # Version assumed when nothing (or only a prefix) is specified.
    version = [11, 0, 132]

    if semantic_ver_string != "":
        parsed = [int(part) for part in semantic_ver_string.split(".")]
        # Parsed components replace the leading defaults; any defaults not
        # covered by the string are retained at the tail.
        version = parsed + version[len(parsed):]

    return version >= [major, minor, patch]
-
-
- ###################################################################################################
- ###################################################################################################
-
- #
def CreateGemmOperator(
    manifest,
    layouts,
    tile_descriptions,
    data_type,
    alignment_constraints,
    complex_transforms=None,
    epilogue_functor=EpilogueFunctor.LinearCombination,
    swizzling_functor=SwizzlingFunctor.Identity8,
):
    """Create universal GEMM operations and register them with the manifest.

    One ``GemmOperation`` of kind ``GemmKind.Universal`` is created for every
    combination of layout, tile description, alignment and complex transform.

    Args:
        manifest: Receives each operation through ``append``; its
            ``args.kernels`` string decides whether the search is narrowed.
        layouts: Iterable of indexable ``(A, B, C)`` layout triples.
        tile_descriptions: Candidate tile descriptions.
        data_type: Sequence unpacked as
            ``(element_a, element_b, element_c, element_epilogue)``.
        alignment_constraints: Candidate alignments for A and B. The alignment
            used for C is capped at 8.
        complex_transforms: Iterable of ``(transform_a, transform_b)`` pairs;
            ``None`` means a single pair with no transform on either operand.
        epilogue_functor: Forwarded to ``GemmOperation``.
        swizzling_functor: Forwarded to ``GemmOperation``.

    Returns:
        list: the operations created, in generation order.
    """
    if complex_transforms is None:
        complex_transforms = [(ComplexTransform.none, ComplexTransform.none)]

    elem_a, elem_b, elem_c, elem_epilogue = data_type

    # Without a kernel filter only the first tile and the first alignment are
    # generated (per the original note: the largest of each).
    if manifest.args.kernels == "":
        tile_descriptions = [tile_descriptions[0]]
        alignment_constraints = [alignment_constraints[0]]

    def build_operation(layout, tile, align_ab, transform):
        # C never uses an alignment larger than 8.
        align_c = min(8, align_ab)
        operand_a = TensorDescription(elem_a, layout[0], align_ab, transform[0])
        operand_b = TensorDescription(elem_b, layout[1], align_ab, transform[1])
        operand_c = TensorDescription(elem_c, layout[2], align_c)
        return GemmOperation(
            GemmKind.Universal,
            tile.minimum_compute_capability,
            tile,
            operand_a,
            operand_b,
            operand_c,
            elem_epilogue,
            epilogue_functor,
            swizzling_functor,
        )

    created = []
    for layout in layouts:
        for tile in tile_descriptions:
            for align_ab in alignment_constraints:
                for transform in complex_transforms:
                    operation = build_operation(layout, tile, align_ab, transform)
                    manifest.append(operation)
                    created.append(operation)

    return created
-
-
- ###########################################################################################################
- # ConvolutionOperator support variations
- # ____________________________________________________________________
- # ConvolutionalOperator | Analytic | Optimized
- # ____________________________________________________________________
- # | Fprop | (strided) | (strided)
- # | Dgrad | (strided, unity*) | (unity)
- # | Wgrad | (strided) | (strided)
- # ____________________________________________________________________
- #
- # Note: Operators marked (*) are supported but not generated, to keep the instantiated kernel count low
- ###########################################################################################################
- # Convolution for 2D operations
def CreateConv2dOperator(
    manifest,
    layout,
    tile_descriptions,
    data_type,
    alignment,
    conv_kinds=None,
    epilogue_functor=EpilogueFunctor.LinearCombination,
):
    """Create 2D convolution operations and register them with the manifest.

    For every tile, convolution kind and iterator algorithm (Analytic and
    Optimized) exactly one ``Conv2dOperation`` is created. Optimized Dgrad is
    generated with ``StrideSupport.Unity`` (strided Dgrad is not supported by
    the Optimized iterator); every other combination uses
    ``StrideSupport.Strided``.

    Args:
        manifest: Receives each operation through ``append``; its
            ``args.kernels`` string decides whether the search is narrowed.
        layout: Indexable ``(A, B, C)`` layout triple.
        tile_descriptions: Candidate tile descriptions.
        data_type: Sequence unpacked as
            ``(element_a, element_b, element_c, element_epilogue)``.
        alignment: Alignment used for A and B. The alignment used for C is
            capped at 8.
        conv_kinds: Iterable of ``ConvKind`` values to generate. ``None``
            (the default) selects Fprop, Dgrad and Wgrad.
        epilogue_functor: Forwarded to ``Conv2dOperation``.

    Returns:
        list: the operations created, in generation order.
    """
    # A None sentinel replaces the former mutable list default; this matches
    # how CreateGemmOperator handles complex_transforms.
    if conv_kinds is None:
        conv_kinds = [ConvKind.Fprop, ConvKind.Dgrad, ConvKind.Wgrad]

    element_a, element_b, element_c, element_epilogue = data_type

    # one exceptional case
    alignment_c = min(8, alignment)

    # iterator algorithm (analytic and optimized)
    iterator_algorithms = [IteratorAlgorithm.Analytic, IteratorAlgorithm.Optimized]

    # by default, only generate the largest tile size
    if manifest.args.kernels == "":
        tile_descriptions = [tile_descriptions[0]]

    operations = []

    for tile in tile_descriptions:
        for conv_kind in conv_kinds:
            for iterator_algorithm in iterator_algorithms:
                A = TensorDescription(element_a, layout[0], alignment)
                B = TensorDescription(element_b, layout[1], alignment)
                C = TensorDescription(element_c, layout[2], alignment_c)

                # Unity stride only for Optimized Dgrad; strided support for
                # Fprop (Analytic/Optimized), Dgrad (Analytic) and Wgrad.
                is_optimized_dgrad = (
                    iterator_algorithm == IteratorAlgorithm.Optimized
                    and conv_kind == ConvKind.Dgrad
                )
                stride_support = (
                    StrideSupport.Unity
                    if is_optimized_dgrad
                    else StrideSupport.Strided
                )

                new_operation = Conv2dOperation(
                    conv_kind,
                    iterator_algorithm,
                    tile.minimum_compute_capability,
                    tile,
                    A,
                    B,
                    C,
                    element_epilogue,
                    stride_support,
                    epilogue_functor,
                )

                manifest.append(new_operation)
                operations.append(new_operation)

    return operations
-
-
- ###################################################################################################
- ###################################################################################################
-
-
def GenerateConv2d_Simt(args):
    """Generate int8 SIMT forward-convolution (Fprop) operations.

    Every (destination type, destination layout) pair is combined with a
    fixed table of threadblock tiles; tiles whose first threadblock dimension
    exceeds the limit for the destination layout are skipped. Each surviving
    tile is handed to ``GenerateConv2d`` on its own.

    Args:
        args: Not used by this generator.

    Returns:
        list: concatenation of everything returned by ``GenerateConv2d``.
    """
    max_cc = 1024

    layouts = [(LayoutType.TensorNC4HW4, LayoutType.TensorC4RSK4)]

    math_instructions = [
        MathInstruction(
            [1, 1, 4],
            DataType.s8,
            DataType.s8,
            DataType.s32,
            OpcodeClass.Simt,
            MathOperation.multiply_add,
        )
    ]

    # (destination element type, destination layout)
    destinations = [
        (DataType.s8, LayoutType.TensorNC4HW4),
        (DataType.s8, LayoutType.TensorNC32HW32),
        (DataType.u4, LayoutType.TensorNHWC),
        (DataType.s4, LayoutType.TensorNHWC),
        (DataType.f32, LayoutType.TensorNCHW),
    ]

    # (threadblock shape, stages, warp count)
    tile_specs = [
        ([128, 128, 32], 2, [2, 4, 1]),
        ([128, 64, 32], 2, [2, 2, 1]),
        ([64, 128, 32], 2, [1, 4, 1]),
        ([128, 32, 32], 2, [2, 1, 1]),
        ([32, 128, 32], 2, [1, 2, 1]),
        ([32, 64, 32], 2, [1, 1, 1]),
        ([64, 32, 32], 2, [1, 1, 1]),
        ([16, 128, 16], 1, [1, 1, 1]),
        ([16, 64, 8], 2, [1, 1, 1]),
    ]

    operations = []
    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in destinations:
                # 4-bit destinations need a higher compute capability and do
                # not use the filter-unity special optimization.
                four_bit_dst = dst_type == DataType.s4 or dst_type == DataType.u4
                min_cc = 75 if four_bit_dst else 61
                use_special_optimization = (
                    SpecialOptimizeDesc.NoneSpecialOpt
                    if four_bit_dst
                    else SpecialOptimizeDesc.ConvFilterUnity
                )

                # Fresh lists per tile so no two descriptions share storage.
                tile_descriptions = [
                    TileDescription(
                        list(shape), stages, list(warps), math_inst, min_cc, max_cc
                    )
                    for shape, stages, warps in tile_specs
                ]

                # Largest first threadblock dimension allowed for this layout.
                if dst_layout == LayoutType.TensorNC32HW32:
                    shape0_limit = 32
                elif (
                    dst_layout == LayoutType.TensorNCHW
                    or dst_layout == LayoutType.TensorNHWC
                ):
                    shape0_limit = 16
                else:
                    shape0_limit = None

                for tile in tile_descriptions:
                    if (
                        shape0_limit is not None
                        and tile.threadblock_shape[0] > shape0_limit
                    ):
                        continue
                    # The three 32s are presumably alignment values - confirm
                    # against the GenerateConv2d signature.
                    operations.extend(
                        GenerateConv2d(
                            ConvType.Convolution,
                            ConvKind.Fprop,
                            [tile],
                            layout[0],
                            layout[1],
                            dst_layout,
                            dst_type,
                            min_cc,
                            32,
                            32,
                            32,
                            use_special_optimization,
                        )
                    )
    return operations
-
-
def GenerateConv2d_TensorOp_8816(args):
    """Generate int8 tensor-op (8x8x16) forward-convolution operations.

    Three groups are produced, in this order:

    1. Blocked NC32HW32 input with s8 output in NC32HW32 or NC4HW4 layout.
    2. NHWC input with output of the math instruction's ``element_b`` type.
    3. NHWC input with s4 / u4 / f32 output.

    Args:
        args: Not used by this generator.

    Returns:
        list: concatenation of everything returned by ``GenerateConv2d``.
    """
    min_cc = 75
    max_cc = 1024
    cuda_major = 10
    cuda_minor = 2
    use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity

    math_instructions = [
        MathInstruction(
            [8, 8, 16],
            DataType.s8,
            DataType.s8,
            DataType.s32,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add_saturate,
        )
    ]

    operations = []

    def make_tiles(math_inst, specs):
        # specs rows are (threadblock shape, stages, warp count); the lists are
        # copied so every TileDescription gets its own storage.
        return [
            TileDescription(
                list(shape), stages, list(warps), math_inst, min_cc, max_cc
            )
            for shape, stages, warps in specs
        ]

    def add_fprop(
        tiles, layout, dst_layout, dst_type, operand_align, dst_align, gemm_mode, flag
    ):
        # operand_align fills the two positional slots before dst_align
        # (presumably source and filter alignment); flag is the boolean that
        # follows the implicit GEMM mode - confirm both against GenerateConv2d.
        operations.extend(
            GenerateConv2d(
                ConvType.Convolution,
                ConvKind.Fprop,
                tiles,
                layout[0],
                layout[1],
                dst_layout,
                dst_type,
                min_cc,
                operand_align,
                operand_align,
                dst_align,
                use_special_optimization,
                gemm_mode,
                flag,
                cuda_major,
                cuda_minor,
            )
        )

    # ---- group 1: blocked NC32HW32 input -----------------------------------
    layouts = [(LayoutType.TensorNC32HW32, LayoutType.TensorC32RSK32)]
    destinations = [
        (DataType.s8, LayoutType.TensorNC32HW32),
        (DataType.s8, LayoutType.TensorNC4HW4),
    ]
    nc32_tile_specs = [
        ([128, 256, 64], 2, [2, 4, 1]),
        ([256, 128, 64], 2, [4, 2, 1]),
        ([128, 128, 64], 2, [2, 2, 1]),
        ([128, 64, 64], 2, [2, 2, 1]),
        ([64, 128, 64], 2, [2, 2, 1]),
        ([128, 64, 32], 1, [2, 2, 1]),
        ([128, 32, 32], 1, [2, 1, 1]),
    ]
    nc4_tile_specs = [
        ([64, 128, 64], 2, [2, 2, 1]),
        ([32, 128, 32], 1, [1, 2, 1]),
    ]

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in destinations:
                if dst_layout == LayoutType.TensorNC32HW32:
                    specs = nc32_tile_specs
                    gemm_mode = ImplicitGemmMode.GemmTN
                    flag = True
                else:
                    assert dst_layout == LayoutType.TensorNC4HW4
                    specs = nc4_tile_specs
                    gemm_mode = ImplicitGemmMode.GemmNT
                    flag = False
                add_fprop(
                    make_tiles(math_inst, specs),
                    layout,
                    dst_layout,
                    dst_type,
                    128,
                    64,
                    gemm_mode,
                    flag,
                )

    # ---- groups 2 and 3 share the NHWC layouts and tile table --------------
    # Third entry of each layout tuple is passed on as operand_align.
    layouts_nhwc = [
        (LayoutType.TensorNHWC, LayoutType.TensorNC4HW4, 32),
        (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 64),
        (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 128),
    ]
    dst_layouts_nhwc = [LayoutType.TensorNHWC]
    nhwc_tile_specs = [
        ([128, 32, 32], 1, [2, 1, 1]),
        ([64, 16, 32], 2, [1, 1, 1]),
    ]

    # ---- group 2: NHWC, output type taken from the math instruction --------
    for math_inst in math_instructions:
        for layout in layouts_nhwc:
            for dst_layout in dst_layouts_nhwc:
                dst_type = math_inst.element_b
                for tile in make_tiles(math_inst, nhwc_tile_specs):
                    tb1 = tile.threadblock_shape[1]
                    dst_align = 32 if tb1 == 16 else 64
                    add_fprop(
                        [tile],
                        layout,
                        dst_layout,
                        dst_type,
                        layout[2],
                        dst_align,
                        ImplicitGemmMode.GemmTN,
                        False,
                    )
                    if tb1 == 16 or tb1 == 32:
                        add_fprop(
                            [tile],
                            layout,
                            dst_layout,
                            dst_type,
                            layout[2],
                            dst_align,
                            ImplicitGemmMode.GemmTN,
                            True,
                        )

    # ---- group 3: INT8x8x4 and INT8x8x32 -----------------------------------
    out_dtypes = [DataType.s4, DataType.u4, DataType.f32]

    for math_inst in math_instructions:
        for layout in layouts_nhwc:
            for dst_layout in dst_layouts_nhwc:
                for out_dtype in out_dtypes:
                    for tile in make_tiles(math_inst, nhwc_tile_specs):
                        tb1 = tile.threadblock_shape[1]
                        type_size = DataTypeSize[out_dtype]
                        # 4 elements for the 16-wide tile or f32 output,
                        # otherwise 8 elements.
                        narrow = tb1 == 16 or out_dtype == DataType.f32
                        dst_align = (4 if narrow else 8) * type_size
                        add_fprop(
                            [tile],
                            layout,
                            dst_layout,
                            out_dtype,
                            layout[2],
                            dst_align,
                            ImplicitGemmMode.GemmTN,
                            False,
                        )
                        if tb1 == 16 or (tb1 == 32 and out_dtype != DataType.f32):
                            add_fprop(
                                [tile],
                                layout,
                                dst_layout,
                                out_dtype,
                                layout[2],
                                dst_align,
                                ImplicitGemmMode.GemmTN,
                                True,
                            )

    return operations
-
-
def GenerateConv2d_TensorOp_8832(args):
    """Generate int4 tensor-op (8x8x32) forward-convolution operations.

    Two math instructions are used (s4 x s4 and s4 x u4, both accumulating in
    s32). Three groups are produced, in this order:

    1. Blocked NC64HW64 input and output, output type ``element_b``.
    2. NHWC input, output type ``element_b``.
    3. NHWC input, s8 output (INT4x4x8).

    Args:
        args: Not used by this generator.

    Returns:
        list: concatenation of everything returned by ``GenerateConv2d``.
    """
    min_cc = 75
    max_cc = 1024
    cuda_major = 10
    cuda_minor = 2
    use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity

    math_instructions = [
        MathInstruction(
            [8, 8, 32],
            DataType.s4,
            DataType.s4,
            DataType.s32,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add_saturate,
        ),
        MathInstruction(
            [8, 8, 32],
            DataType.s4,
            DataType.u4,
            DataType.s32,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add_saturate,
        ),
    ]

    operations = []

    def make_tiles(math_inst, specs):
        # specs rows are (threadblock shape, stages, warp count); the lists are
        # copied so every TileDescription gets its own storage.
        return [
            TileDescription(
                list(shape), stages, list(warps), math_inst, min_cc, max_cc
            )
            for shape, stages, warps in specs
        ]

    def add_fprop(tiles, layout, dst_layout, dst_type, operand_align, dst_align, flag):
        # operand_align fills the two positional slots before dst_align
        # (presumably source and filter alignment); flag is the boolean that
        # follows the implicit GEMM mode - confirm both against GenerateConv2d.
        operations.extend(
            GenerateConv2d(
                ConvType.Convolution,
                ConvKind.Fprop,
                tiles,
                layout[0],
                layout[1],
                dst_layout,
                dst_type,
                min_cc,
                operand_align,
                operand_align,
                dst_align,
                use_special_optimization,
                ImplicitGemmMode.GemmTN,
                flag,
                cuda_major,
                cuda_minor,
            )
        )

    # ---- group 1: blocked NC64HW64 -----------------------------------------
    layouts = [(LayoutType.TensorNC64HW64, LayoutType.TensorC64RSK64)]
    dst_layouts = [LayoutType.TensorNC64HW64]
    nc64_tile_specs = [
        ([128, 256, 128], 2, [2, 4, 1]),
        ([128, 128, 128], 2, [2, 2, 1]),
        ([128, 64, 128], 2, [2, 1, 1]),
        ([128, 64, 64], 1, [2, 1, 1]),
    ]

    for math_inst in math_instructions:
        for layout in layouts:
            for dst_layout in dst_layouts:
                dst_type = math_inst.element_b
                add_fprop(
                    make_tiles(math_inst, nc64_tile_specs),
                    layout,
                    dst_layout,
                    dst_type,
                    128,
                    64,
                    True,
                )

    # ---- groups 2 and 3: NHWC ----------------------------------------------
    # Third entry of each layout tuple is passed on as operand_align.
    layouts_nhwc = [
        (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),
        (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 64),
        (LayoutType.TensorNHWC, LayoutType.TensorNC32HW32, 128),
    ]
    dst_layouts_nhwc = [LayoutType.TensorNHWC]
    nhwc_tile_specs = [
        ([128, 16, 64], 2, [1, 1, 1]),
        ([128, 32, 64], 1, [2, 1, 1]),
        ([128, 64, 64], 1, [2, 1, 1]),
    ]

    def add_nhwc_group(resolve_dst_type, align_low, align_mid, align_high):
        # flag=False call: align_low for the 16-wide tile, align_mid otherwise.
        # flag=True call (32- and 64-wide tiles only): align_mid for the
        # 32-wide tile, align_high for the 64-wide one.
        for math_inst in math_instructions:
            for layout in layouts_nhwc:
                for dst_layout in dst_layouts_nhwc:
                    dst_type = resolve_dst_type(math_inst)
                    for tile in make_tiles(math_inst, nhwc_tile_specs):
                        tb1 = tile.threadblock_shape[1]
                        add_fprop(
                            [tile],
                            layout,
                            dst_layout,
                            dst_type,
                            layout[2],
                            align_low if tb1 == 16 else align_mid,
                            False,
                        )
                        if tb1 == 32 or tb1 == 64:
                            add_fprop(
                                [tile],
                                layout,
                                dst_layout,
                                dst_type,
                                layout[2],
                                align_mid if tb1 == 32 else align_high,
                                True,
                            )

    # group 2: output type follows the math instruction
    add_nhwc_group(lambda inst: inst.element_b, 16, 32, 64)
    # group 3: INT4x4x8
    add_nhwc_group(lambda inst: DataType.s8, 32, 64, 128)

    return operations
-
-
def GenerateDeconv_Simt(args):
    """Generate int8 SIMT deconvolution (Dgrad) operations.

    Uses the NC4HW4 / K4RSC4 layout pair with s8 output in NC4HW4 layout and
    the ``DeconvDoubleUpsampling`` special optimization. The whole tile table
    is passed to ``GenerateConv2d`` in one call per destination.

    Args:
        args: Not used by this generator.

    Returns:
        list: concatenation of everything returned by ``GenerateConv2d``.
    """
    min_cc = 61
    max_cc = 1024
    use_special_optimization = SpecialOptimizeDesc.DeconvDoubleUpsampling

    layouts = [(LayoutType.TensorNC4HW4, LayoutType.TensorK4RSC4)]

    math_instructions = [
        MathInstruction(
            [1, 1, 4],
            DataType.s8,
            DataType.s8,
            DataType.s32,
            OpcodeClass.Simt,
            MathOperation.multiply_add,
        )
    ]

    # (destination element type, destination layout)
    destinations = [(DataType.s8, LayoutType.TensorNC4HW4)]

    # (threadblock shape, stages, warp count)
    tile_specs = [
        ([32, 128, 32], 2, [1, 2, 1]),
        ([16, 128, 16], 2, [1, 2, 1]),
        ([16, 128, 16], 1, [1, 1, 1]),
        ([16, 64, 8], 2, [1, 1, 1]),
    ]

    operations = []
    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in destinations:
                tile_descriptions = [
                    TileDescription(
                        list(shape), stages, list(warps), math_inst, min_cc, max_cc
                    )
                    for shape, stages, warps in tile_specs
                ]
                # The three 32s are presumably alignment values - confirm
                # against the GenerateConv2d signature.
                operations.extend(
                    GenerateConv2d(
                        ConvType.Convolution,
                        ConvKind.Dgrad,
                        tile_descriptions,
                        layout[0],
                        layout[1],
                        dst_layout,
                        dst_type,
                        min_cc,
                        32,
                        32,
                        32,
                        use_special_optimization,
                    )
                )
    return operations
-
-
def GenerateDeconv_TensorOp_8816(args):
    """Generate int8 tensor-op (8x8x16) deconvolution (Dgrad) operations.

    NHWC input is combined with three CKxRSx layouts; output is s8 in NHWC
    layout. Each tile is handed to ``GenerateConv2d`` on its own, with the
    destination alignment chosen from the tile's second threadblock dimension.

    Args:
        args: Not used by this generator.

    Returns:
        list: concatenation of everything returned by ``GenerateConv2d``.
    """
    min_cc = 75
    max_cc = 1024
    cuda_major = 10
    cuda_minor = 2
    use_special_optimization = SpecialOptimizeDesc.DeconvDoubleUpsampling

    # Third entry of each tuple is passed twice to GenerateConv2d, in the two
    # positional slots before dst_align (presumably source/filter alignment -
    # confirm against GenerateConv2d).
    layouts = [
        (LayoutType.TensorNHWC, LayoutType.TensorCK4RS4, 32),
        (LayoutType.TensorNHWC, LayoutType.TensorCK8RS8, 64),
        (LayoutType.TensorNHWC, LayoutType.TensorCK16RS16, 128),
    ]

    math_instructions = [
        MathInstruction(
            [8, 8, 16],
            DataType.s8,
            DataType.s8,
            DataType.s32,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add_saturate,
        )
    ]

    # (destination element type, destination layout)
    destinations = [(DataType.s8, LayoutType.TensorNHWC)]

    # (threadblock shape, stages, warp count)
    tile_specs = [
        ([128, 32, 32], 1, [2, 1, 1]),
        ([64, 16, 32], 2, [1, 1, 1]),
    ]

    operations = []
    for math_inst in math_instructions:
        for layout in layouts:
            for dst_type, dst_layout in destinations:
                tile_descriptions = [
                    TileDescription(
                        list(shape), stages, list(warps), math_inst, min_cc, max_cc
                    )
                    for shape, stages, warps in tile_specs
                ]
                for tile in tile_descriptions:
                    if tile.threadblock_shape[1] == 16:
                        dst_align = 32
                    else:
                        dst_align = 64
                    operations.extend(
                        GenerateConv2d(
                            ConvType.Convolution,
                            ConvKind.Dgrad,
                            [tile],
                            layout[0],
                            layout[1],
                            dst_layout,
                            dst_type,
                            min_cc,
                            layout[2],
                            layout[2],
                            dst_align,
                            use_special_optimization,
                            ImplicitGemmMode.GemmTN,
                            False,
                            cuda_major,
                            cuda_minor,
                        )
                    )
    return operations
-
-
################################################################################
# Search-space parameters for the SIMT tile enumeration below
#   Edge    - candidate lengths for one side of a tile
#   Ratio   - maximum ratio between two edges; limits how skinny a tile may be
#   MaxEdge - maximum length of each edge
#   Min/Max - minimum/maximum of the product of the edge lengths
################################################################################

# Warps per threadblock, per dimension.
warpsPerThreadblockEdge = [1, 2, 4, 8, 16]
warpsPerThreadblockRatio = 2
warpsPerThreadblockMax = 16
# NOTE: 1x32 and 2x16 warp tile shapes fail validation for ~10% of cases

# Warp tile shape, per dimension.
warpShapeEdges = [8, 16, 32, 64, 128, 256]
warpShapeRatio = 4
warpShapeMax = 64 * 64
warpShapeMin = 8 * 8

# Upper bound for either edge of a threadblock tile.
threadblockEdgeMax = 256

# Keyed by precision character. Each value is
#   [type name, bits per element, max tile (elements), L0 threadblock tiles]
precisions = {
    "c": [
        "cutlass::complex<float>",
        64,
        64 * 128,
        [[64, 128], [64, 32]],
    ],
    "d": [
        "double",
        64,
        64 * 64,
        [[64, 64], [32, 32]],
    ],
    "h": [
        "cutlass::half_t",
        16,
        128 * 256,
        [[256, 128], [64, 128], [64, 32]],
    ],
    "i": [
        "int",
        32,
        128 * 128,
        [[128, 64], [16, 32]],
    ],
    "s": [
        "float",
        32,
        128 * 128,
        [[128, 256], [128, 128], [64, 64]],
    ],
    "z": [
        "cutlass::complex<double>",
        128,
        64 * 64,
        [[32, 64], [16, 32]],
    ],
}
# L1 will have a single kernel for every unique shape
# L2 will have everything else
def GenerateGemm_Simt(args):
    """Generate single-precision (f32) SIMT GEMM operations.

    Candidate tiles are enumerated from the module-level search parameters
    (warps per threadblock x warp shape), pruned by shape, thread-tile and
    shared-memory limits, and finally kept only if they match an entry of the
    explicit tile whitelist (same threadblock shape, warp count and stages).
    ``GeneratesGemm`` is called once per surviving tile for each of the
    nn / nt / tn / tt layout combinations.

    Args:
        args: Not used by this generator.

    Returns:
        list: concatenation of everything returned by ``GeneratesGemm``,
        ordered by math instruction, layout, warps-per-threadblock and warp
        shape.
    """
    ################################################################################
    # warps per threadblock
    ################################################################################
    warpsPerThreadblocks = []
    for warpsPerThreadblock0 in warpsPerThreadblockEdge:
        for warpsPerThreadblock1 in warpsPerThreadblockEdge:
            if (
                warpsPerThreadblock0 / warpsPerThreadblock1 <= warpsPerThreadblockRatio
                and warpsPerThreadblock1 / warpsPerThreadblock0
                <= warpsPerThreadblockRatio
                and warpsPerThreadblock0 * warpsPerThreadblock1
                <= warpsPerThreadblockMax
            ):
                warpsPerThreadblocks.append(
                    [warpsPerThreadblock0, warpsPerThreadblock1]
                )

    ################################################################################
    # warp shapes
    ################################################################################
    warpNumThreads = 32
    warpShapes = []
    for warp0 in warpShapeEdges:
        for warp1 in warpShapeEdges:
            if (
                warp0 / warp1 <= warpShapeRatio
                and warp1 / warp0 <= warpShapeRatio
                and warp0 * warp1 <= warpShapeMax
                and warp0 * warp1 > warpShapeMin
            ):
                warpShapes.append([warp0, warp1])

    # sgemm: only the element width and the tile-size cap are needed here
    _, precisionBits, threadblockMaxElements, _ = precisions["s"]

    ################################################################################
    # candidate tiles
    #
    # None of the pruning below depends on the layout or the math instruction,
    # so the search is done once up front. Each entry is
    # (threadblock shape [M, N, K], warp count [M, N, 1]).
    ################################################################################
    candidates = []
    for warpsPerThreadblock in warpsPerThreadblocks:
        for warpShape in warpShapes:
            warpThreadsM = 8 if warpShape[0] > warpShape[1] else 4
            # integer division: 32 threads split exactly into 8x4 or 4x8
            warpThreadsN = warpNumThreads // warpThreadsM

            # skip shapes with conflicting rectangularity
            # they are unlikely to be fastest
            blockG = warpsPerThreadblock[0] > warpsPerThreadblock[1]
            blockL = warpsPerThreadblock[0] < warpsPerThreadblock[1]
            warpG = warpShape[0] > warpShape[1]
            warpL = warpShape[0] < warpShape[1]

            blockG2 = warpsPerThreadblock[0] > warpsPerThreadblock[1] * 2
            blockL2 = warpsPerThreadblock[0] * 2 < warpsPerThreadblock[1]
            warpG2 = warpShape[0] > warpShape[1] * 2
            warpL2 = warpShape[0] * 2 < warpShape[1]

            if (
                (blockG2 and warpL)
                or (blockL2 and warpG)
                or (warpG2 and blockL)
                or (warpL2 and blockG)
            ):
                continue

            # check threadblock ratios and max
            threadblockTile = [
                warpShape[0] * warpsPerThreadblock[0],
                warpShape[1] * warpsPerThreadblock[1],
            ]
            if threadblockTile[0] * threadblockTile[1] > threadblockMaxElements:
                continue
            if (
                threadblockTile[0] > threadblockEdgeMax
                or threadblockTile[1] > threadblockEdgeMax
            ):
                continue
            totalThreads = (
                warpNumThreads * warpsPerThreadblock[0] * warpsPerThreadblock[1]
            )

            # calculate unroll
            # ensure that every iteration at least a full load of A,B are done
            unrollMin = 8
            unrollMin0 = totalThreads // threadblockTile[0]
            unrollMin1 = totalThreads // threadblockTile[1]
            unroll = max(unrollMin, unrollMin0, unrollMin1)

            threadTileM = warpShape[0] // warpThreadsM
            threadTileN = warpShape[1] // warpThreadsN
            if threadTileM < 2 or threadTileN < 2:
                continue
            if threadTileM * threadTileN * precisionBits > 8 * 8 * 32:
                continue

            # epilogue currently only supports N < WarpNumThreads
            if threadblockTile[1] < warpNumThreads:
                continue

            # limit smem (factor 2 as in the original formula; 48 KB cap)
            smemBitsA = threadblockTile[0] * unroll * 2 * precisionBits
            smemBitsB = threadblockTile[1] * unroll * 2 * precisionBits
            smemKBytes = (smemBitsA + smemBitsB) / 8 / 1024
            if smemKBytes > 48:
                continue

            candidates.append(
                (
                    [threadblockTile[0], threadblockTile[1], unroll],
                    [
                        threadblockTile[0] // warpShape[0],
                        threadblockTile[1] // warpShape[1],
                        1,
                    ],
                )
            )

    layouts = [
        (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),  # nn
        (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor),  # nt
        (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor),  # tn
        (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor),  # tt
    ]

    math_instructions = [
        MathInstruction(
            [1, 1, 1],
            DataType.f32,
            DataType.f32,
            DataType.f32,
            OpcodeClass.Simt,
            MathOperation.multiply_add,
        )
    ]

    min_cc = 50
    max_cc = 1024

    # Whitelist rows: (threadblock shape, stages, warp count).
    allowed_tile_specs = [
        ([64, 256, 8], 2, [2, 4, 1]),
        ([256, 64, 8], 2, [4, 2, 1]),
        ([32, 256, 8], 2, [2, 4, 1]),
        ([256, 32, 8], 2, [4, 2, 1]),
        ([128, 128, 8], 2, [4, 2, 1]),
        ([128, 64, 8], 2, [2, 2, 1]),
        ([64, 128, 8], 2, [2, 2, 1]),
        ([128, 32, 8], 2, [2, 1, 1]),
        ([32, 128, 8], 2, [1, 2, 1]),
        ([64, 64, 8], 2, [2, 1, 1]),
        ([32, 64, 8], 2, [1, 1, 1]),
        ([64, 32, 8], 2, [1, 1, 1]),
        ([32, 32, 8], 2, [1, 1, 1]),
        ([8, 32, 8], 2, [1, 1, 1]),
        ([16, 32, 8], 2, [1, 1, 1]),
        ([16, 64, 8], 2, [1, 1, 1]),
        ([16, 128, 8], 2, [1, 2, 1]),
    ]

    def same_configuration(lhs, rhs):
        # Two tiles match when threadblock shape, warp count and stage count
        # agree component by component.
        return (
            all(lhs.threadblock_shape[i] == rhs.threadblock_shape[i] for i in range(3))
            and all(lhs.warp_count[i] == rhs.warp_count[i] for i in range(3))
            and lhs.stages == rhs.stages
        )

    operations = []
    for math_inst in math_instructions:
        # The whitelist is only compared against, so one copy per math
        # instruction is enough.
        tile_descriptions = [
            TileDescription(list(shape), stages, list(warps), math_inst, min_cc, max_cc)
            for shape, stages, warps in allowed_tile_specs
        ]
        for layout in layouts:
            data_type = [
                math_inst.element_a,
                math_inst.element_b,
                math_inst.element_accumulator,
                math_inst.element_accumulator,
            ]
            for threadblock_shape, warp_count in candidates:
                # A fresh TileDescription per generated operation, as before.
                tile = TileDescription(
                    list(threadblock_shape),
                    2,
                    list(warp_count),
                    math_inst,
                    min_cc,
                    max_cc,
                )

                if not any(same_configuration(t, tile) for t in tile_descriptions):
                    continue

                operations += GeneratesGemm(
                    tile, data_type, layout[0], layout[1], layout[2], min_cc
                )
    return operations
-
-
- #
def GenerateDwconv2d_Simt(args, conv_kind):
    """Generate the SIMT f32 depthwise-conv2d operations for ``conv_kind``.

    Candidate tiles are enumerated from the module-level search-space settings
    (``warpsPerThreadblockEdge``, ``warpShapeEdges`` and their ratio/size
    limits), pruned by a series of heuristics, and finally kept only when they
    match one of the hand-picked tile descriptions listed in the body.  Each
    surviving tile is handed to ``GenerateConv2d`` once per source alignment.

    Args:
        args: Parsed command-line arguments; not read by this generator.
        conv_kind: Convolution kind forwarded unchanged to ``GenerateConv2d``
            (the dispatchers in this file pass ``ConvKind.Fprop`` or
            ``ConvKind.Dgrad``).

    Returns:
        A list accumulating everything returned by ``GenerateConv2d``.
    """

    def same_config(lhs, rhs):
        """Tell whether two tiles share threadblock shape, warp count and stages."""
        return (
            all(lhs.threadblock_shape[i] == rhs.threadblock_shape[i] for i in range(3))
            and all(lhs.warp_count[i] == rhs.warp_count[i] for i in range(3))
            and lhs.stages == rhs.stages
        )

    ############################################################################
    # warps per threadblock
    ############################################################################
    # True division is intentional here: these are aspect-ratio comparisons.
    warps_per_threadblocks = [
        [warps_m, warps_n]
        for warps_m in warpsPerThreadblockEdge
        for warps_n in warpsPerThreadblockEdge
        if warps_m / warps_n <= warpsPerThreadblockRatio
        and warps_n / warps_m <= warpsPerThreadblockRatio
        and warps_m * warps_n <= warpsPerThreadblockMax
    ]

    ############################################################################
    # warp shapes
    ############################################################################
    warp_num_threads = 32
    warp_shapes = [
        [warp_m, warp_n]
        for warp_m in warpShapeEdges
        for warp_n in warpShapeEdges
        if warp_m / warp_n <= warpShapeRatio
        and warp_n / warp_m <= warpShapeRatio
        and warp_m * warp_n <= warpShapeMax
        and warp_m * warp_n > warpShapeMin
    ]

    # Single-precision settings; the first and last entries of the record are
    # not needed by this generator.
    _, precision_bits, threadblock_max_elements, _ = precisions["s"]

    layouts = [(LayoutType.TensorNCHW, LayoutType.TensorNCHW)]

    math_instructions = [
        MathInstruction(
            [1, 1, 1],
            DataType.f32,
            DataType.f32,
            DataType.f32,
            OpcodeClass.Simt,
            MathOperation.multiply_add,
        )
    ]

    min_cc = 50
    max_cc = 1024

    dst_layouts = [LayoutType.TensorNCHW]
    dst_types = [DataType.f32]
    alignment_constraints = [128, 32]

    # (threadblock shape, warp count) of the only tiles that may be emitted.
    supported_configs = (
        ((128, 128, 8), (4, 2, 1)),
        ((128, 64, 8), (2, 2, 1)),
        ((64, 128, 8), (2, 2, 1)),
        ((128, 32, 8), (2, 1, 1)),
        ((32, 128, 8), (1, 2, 1)),
        ((64, 64, 8), (2, 1, 1)),
        ((32, 64, 8), (1, 1, 1)),
        ((64, 32, 8), (1, 1, 1)),
        ((32, 32, 8), (1, 1, 1)),
    )

    operations = []
    for math_inst in math_instructions:
        supported_tiles = [
            TileDescription(list(shape), 2, list(warps), math_inst, min_cc, max_cc)
            for shape, warps in supported_configs
        ]
        for warps_per_threadblock in warps_per_threadblocks:
            for warp_shape in warp_shapes:
                warp_threads_m = 8 if warp_shape[0] > warp_shape[1] else 4
                # Floor division keeps the thread counts (and the thread tile
                # derived from them) integral; 32 is divisible by both 4 and 8.
                warp_threads_n = warp_num_threads // warp_threads_m

                # skip shapes with conflicting rectangularity
                # they are unlikely to be fastest
                block_g = warps_per_threadblock[0] > warps_per_threadblock[1]
                block_l = warps_per_threadblock[0] < warps_per_threadblock[1]
                warp_g = warp_shape[0] > warp_shape[1]
                warp_l = warp_shape[0] < warp_shape[1]

                block_g2 = warps_per_threadblock[0] > warps_per_threadblock[1] * 2
                block_l2 = warps_per_threadblock[0] * 2 < warps_per_threadblock[1]
                warp_g2 = warp_shape[0] > warp_shape[1] * 2
                warp_l2 = warp_shape[0] * 2 < warp_shape[1]

                if (
                    (block_g2 and warp_l)
                    or (block_l2 and warp_g)
                    or (warp_g2 and block_l)
                    or (warp_l2 and block_g)
                ):
                    continue

                # check threadblock ratios and max
                threadblock_tile = [
                    warp_shape[0] * warps_per_threadblock[0],
                    warp_shape[1] * warps_per_threadblock[1],
                ]
                if threadblock_tile[0] * threadblock_tile[1] > threadblock_max_elements:
                    continue
                if threadblock_tile[0] > threadblockEdgeMax:
                    continue
                if threadblock_tile[1] > threadblockEdgeMax:
                    continue
                total_threads = (
                    warp_num_threads
                    * warps_per_threadblock[0]
                    * warps_per_threadblock[1]
                )

                # calculate unroll
                # ensure that every iteration at least a full load of A,B are done
                unroll = max(
                    8,
                    total_threads // threadblock_tile[0],
                    total_threads // threadblock_tile[1],
                )

                thread_tile_m = warp_shape[0] // warp_threads_m
                thread_tile_n = warp_shape[1] // warp_threads_n
                if thread_tile_m < 2 or thread_tile_n < 2:
                    continue
                if thread_tile_m * thread_tile_n * precision_bits > 8 * 8 * 32:
                    continue

                # the epilogue requires the threadblock N extent to be at
                # least one warp wide, so narrower tiles are skipped
                if threadblock_tile[1] < warp_num_threads:
                    continue

                # limit smem (the factor 2 matches the 2 stages used below)
                smem_bits_a = threadblock_tile[0] * unroll * 2 * precision_bits
                smem_bits_b = threadblock_tile[1] * unroll * 2 * precision_bits
                smem_kbytes = (smem_bits_a + smem_bits_b) / 8 / 1024
                if smem_kbytes > 48:
                    continue

                tile = TileDescription(
                    [threadblock_tile[0], threadblock_tile[1], unroll],
                    2,
                    [
                        threadblock_tile[0] // warp_shape[0],
                        threadblock_tile[1] // warp_shape[1],
                        1,
                    ],
                    math_inst,
                    min_cc,
                    max_cc,
                )

                # Only emit tiles that appear in the hand-picked list above.
                if not any(same_config(known, tile) for known in supported_tiles):
                    continue

                for layout in layouts:
                    for dst_type, dst_layout in zip(dst_types, dst_layouts):
                        for alignment_src in alignment_constraints:
                            # The two literal 32s are positional arguments
                            # whose meaning is defined by GenerateConv2d.
                            operations += GenerateConv2d(
                                ConvType.DepthwiseConvolution,
                                conv_kind,
                                [tile],
                                layout[0],
                                layout[1],
                                dst_layout,
                                dst_type,
                                min_cc,
                                alignment_src,
                                32,
                                32,
                                SpecialOptimizeDesc.NoneSpecialOpt,
                                ImplicitGemmMode.GemmTN,
                            )
    return operations
-
-
- #
def GenerateDwconv2d_TensorOp_884(args, conv_kind):
    """Generate the f16 TensorOp (8x8x4 instruction) depthwise-conv2d operations.

    Two math instructions are used (f16 inputs with f32 or f16 accumulation).
    For each of them a fixed list of tile descriptions is passed to
    ``GenerateConv2d`` once per source alignment.

    Args:
        args: Parsed command-line arguments; not read by this generator.
        conv_kind: Convolution kind forwarded unchanged to ``GenerateConv2d``.

    Returns:
        A list accumulating everything returned by ``GenerateConv2d``.
    """
    nchw = LayoutType.TensorNCHW
    layouts = [(nchw, nchw)]

    # Same instruction shape and input types; only the accumulator differs.
    math_instructions = [
        MathInstruction(
            [8, 8, 4],
            DataType.f16,
            DataType.f16,
            accumulator,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add,
        )
        for accumulator in (DataType.f32, DataType.f16)
    ]

    min_cc, max_cc = 70, 75
    cuda_major, cuda_minor = 10, 2

    # Destination (type, layout) pairs, iterated in lock-step.
    destinations = [(DataType.f16, nchw)]
    source_alignments = (128, 32, 16)

    # (threadblock shape, warp count); every tile uses 2 stages.
    tile_configs = (
        ((128, 256, 32), (2, 4, 1)),
        ((128, 128, 32), (4, 4, 1)),
        ((64, 128, 32), (2, 4, 1)),
        ((128, 64, 32), (4, 2, 1)),
        ((64, 64, 32), (2, 2, 1)),
    )

    operations = []
    for math_inst in math_instructions:
        tile_descriptions = [
            TileDescription(list(shape), 2, list(warps), math_inst, min_cc, max_cc)
            for shape, warps in tile_configs
        ]
        for src_layout, filter_layout in layouts:
            for dst_type, dst_layout in destinations:
                for alignment_src in source_alignments:
                    # The literal 16, 16 and False are positional arguments
                    # whose meaning is defined by GenerateConv2d.
                    generated = GenerateConv2d(
                        ConvType.DepthwiseConvolution,
                        conv_kind,
                        tile_descriptions,
                        src_layout,
                        filter_layout,
                        dst_layout,
                        dst_type,
                        min_cc,
                        alignment_src,
                        16,
                        16,
                        SpecialOptimizeDesc.NoneSpecialOpt,
                        ImplicitGemmMode.GemmTN,
                        False,
                        cuda_major,
                        cuda_minor,
                    )
                    operations += generated

    return operations
-
-
- #
def GenerateGemv_Simt(args):
    """Generate the SIMT f32 GEMV operations.

    One ``GeneratesGemv`` result is appended for every combination of
    threadblock N extent and A/B load width (in bits).  The threadblock K
    extent and the per-thread shape are derived from those load widths.

    Args:
        args: Parsed command-line arguments; not read by this generator.

    Returns:
        A list with one ``GeneratesGemv`` result per combination.
    """
    row_major = LayoutType.RowMajor
    layouts = [(row_major, row_major, row_major)]

    math_instructions = [
        MathInstruction(
            [1, 1, 1],
            DataType.f32,
            DataType.f32,
            DataType.f32,
            OpcodeClass.Simt,
            MathOperation.multiply_add,
        )
    ]

    min_cc = 50

    threadblock_n_choices = (128, 64, 32)
    load_bits_a = (128, 64, 32)
    load_bits_b = (128, 64, 32)

    operations = []
    for math_inst in math_instructions:
        element_bits_a = DataTypeSize[math_inst.element_a]
        element_bits_b = DataTypeSize[math_inst.element_b]
        for layout_a, layout_b, layout_c in layouts:
            data_type = [
                math_inst.element_a,
                math_inst.element_b,
                math_inst.element_accumulator,
                math_inst.element_accumulator,
            ]
            for threadblock_shape_n in threadblock_n_choices:
                for align_a in load_bits_a:
                    # Elements of A covered by one load of ``align_a`` bits.
                    ldg_elements_a = align_a // element_bits_a
                    for align_b in load_bits_b:
                        # Elements of B covered by one load of ``align_b`` bits.
                        ldg_elements_b = align_b // element_bits_b
                        threads_along_n = threadblock_shape_n // ldg_elements_b
                        threadblock_shape_k = (256 * ldg_elements_a) // threads_along_n

                        operation = GeneratesGemv(
                            math_inst,
                            [1, threadblock_shape_n, threadblock_shape_k],
                            [1, ldg_elements_b, ldg_elements_a],
                            data_type,
                            layout_a,
                            layout_b,
                            layout_c,
                            min_cc,
                            align_a,
                            align_b,
                        )
                        operations.append(operation)
    return operations
-
-
- #
def GeneratesGemm_TensorOp_1688(args):
    """Generate the f16 TensorOp GEMM operations for the 16x8x8 instruction.

    Every combination of math instruction (f32 or f16 accumulation), operand
    layout (nn/nt/tn/tt), alignment and tile description is passed to
    ``GeneratesGemm``.

    Args:
        args: Parsed command-line arguments; not read by this generator.

    Returns:
        A list accumulating everything returned by ``GeneratesGemm``.
    """
    col, row = LayoutType.ColumnMajor, LayoutType.RowMajor
    layouts = [
        (col, col, row),  # nn
        (col, row, row),  # nt
        (row, col, row),  # tn
        (row, row, row),  # tt
    ]

    # Same instruction shape and input types; only the accumulator differs.
    math_instructions = [
        MathInstruction(
            [16, 8, 8],
            DataType.f16,
            DataType.f16,
            accumulator,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add,
        )
        for accumulator in (DataType.f32, DataType.f16)
    ]

    min_cc, max_cc = 75, 1024
    cuda_major, cuda_minor = 10, 2

    # An alignment of 1 is deliberately not generated.
    alignment_constraints = (8, 4, 2)

    # (threadblock shape, warp count); every tile uses 2 stages.
    # The smaller [64, 128, 32], [128, 64, 32] and [64, 64, 32] tiles (warp
    # count [2, 2, 1]) are left out to reduce compilation time and binary size.
    tile_configs = (
        ((256, 128, 32), (4, 2, 1)),
        ((128, 256, 32), (2, 4, 1)),
        ((128, 128, 32), (2, 2, 1)),
    )

    operations = []
    for math_inst in math_instructions:
        for layout_a, layout_b, layout_c in layouts:
            for align in alignment_constraints:
                tile_descriptions = [
                    TileDescription(
                        list(shape), 2, list(warps), math_inst, min_cc, max_cc
                    )
                    for shape, warps in tile_configs
                ]

                data_type = [
                    math_inst.element_a,
                    math_inst.element_b,
                    math_inst.element_a,
                    math_inst.element_accumulator,
                ]

                # Presumably converts an element count into bits for 16-bit
                # operands; the same value is used for all three alignments.
                scaled_align = align * 16

                for tile in tile_descriptions:
                    generated = GeneratesGemm(
                        tile,
                        data_type,
                        layout_a,
                        layout_b,
                        layout_c,
                        min_cc,
                        scaled_align,
                        scaled_align,
                        scaled_align,
                        cuda_major,
                        cuda_minor,
                    )
                    operations += generated
    return operations
-
-
- #
def GeneratesGemm_TensorOp_884(args):
    """Generate the f16 TensorOp GEMM operations for the 8x8x4 instruction.

    Every combination of math instruction (f32 or f16 accumulation), operand
    layout (nn/nt/tn/tt), alignment and tile description is passed to
    ``GeneratesGemm``.

    Args:
        args: Parsed command-line arguments; not read by this generator.

    Returns:
        A list accumulating everything returned by ``GeneratesGemm``.
    """
    col, row = LayoutType.ColumnMajor, LayoutType.RowMajor
    layouts = [
        (col, col, row),  # nn
        (col, row, row),  # nt
        (row, col, row),  # tn
        (row, row, row),  # tt
    ]

    # Same instruction shape and input types; only the accumulator differs.
    math_instructions = [
        MathInstruction(
            [8, 8, 4],
            DataType.f16,
            DataType.f16,
            accumulator,
            OpcodeClass.TensorOp,
            MathOperation.multiply_add,
        )
        for accumulator in (DataType.f32, DataType.f16)
    ]

    min_cc, max_cc = 70, 75
    cuda_major, cuda_minor = 10, 2

    # An alignment of 1 is deliberately not generated.
    alignment_constraints = (8, 4, 2)

    # (threadblock shape, warp count); every tile uses 2 stages.
    # The smaller [64, 128, 32], [128, 64, 32] and [64, 64, 32] tiles (warp
    # count [2, 2, 1]) are left out to reduce compilation time and binary size.
    tile_configs = (
        ((256, 128, 32), (4, 2, 1)),
        ((128, 256, 32), (2, 4, 1)),
        ((128, 128, 32), (2, 2, 1)),
    )

    operations = []
    for math_inst in math_instructions:
        for layout_a, layout_b, layout_c in layouts:
            for align in alignment_constraints:
                tile_descriptions = [
                    TileDescription(
                        list(shape), 2, list(warps), math_inst, min_cc, max_cc
                    )
                    for shape, warps in tile_configs
                ]

                data_type = [
                    math_inst.element_a,
                    math_inst.element_b,
                    math_inst.element_a,
                    math_inst.element_accumulator,
                ]

                # Presumably converts an element count into bits for 16-bit
                # operands; the same value is used for all three alignments.
                scaled_align = align * 16

                for tile in tile_descriptions:
                    generated = GeneratesGemm(
                        tile,
                        data_type,
                        layout_a,
                        layout_b,
                        layout_c,
                        min_cc,
                        scaled_align,
                        scaled_align,
                        scaled_align,
                        cuda_major,
                        cuda_minor,
                    )
                    operations += generated

    return operations
-
-
- #
def GenerateConv2dOperations(args):
    """Dispatch conv2d kernel generation on ``args.type``.

    Args:
        args: Parsed command-line arguments; only ``args.type`` is read here.

    Returns:
        Whatever the selected ``GenerateConv2d_*`` generator returns.

    Raises:
        AssertionError: If ``args.type`` is not ``simt``, ``tensorop8816`` or
            ``tensorop8832``.
    """
    if args.type == "simt":
        return GenerateConv2d_Simt(args)
    if args.type == "tensorop8816":
        return GenerateConv2d_TensorOp_8816(args)
    # The two literals are concatenated by the parser, so the first one must
    # end with a space to keep the words of the message apart.
    assert args.type == "tensorop8832", (
        "operation conv2d only support "
        "simt, tensorop8816 and tensorop8832. (got:{})".format(args.type)
    )
    return GenerateConv2d_TensorOp_8832(args)
-
-
def GenerateDeconvOperations(args):
    """Dispatch deconvolution kernel generation on ``args.type``.

    Args:
        args: Parsed command-line arguments; only ``args.type`` is read here.

    Returns:
        Whatever the selected ``GenerateDeconv_*`` generator returns.

    Raises:
        AssertionError: If ``args.type`` is neither ``simt`` nor
            ``tensorop8816``.
    """
    if args.type == "simt":
        return GenerateDeconv_Simt(args)
    # The two literals are concatenated by the parser, so the first one must
    # end with a space to keep the words of the message apart.
    assert args.type == "tensorop8816", (
        "operation deconv only support "
        "simt and tensorop8816. (got:{})".format(args.type)
    )
    return GenerateDeconv_TensorOp_8816(args)
-
-
def GenerateDwconv2dFpropOperations(args):
    """Dispatch depthwise-conv2d forward (fprop) generation on ``args.type``.

    Args:
        args: Parsed command-line arguments; only ``args.type`` is read here.

    Returns:
        Whatever the selected ``GenerateDwconv2d_*`` generator returns for
        ``ConvKind.Fprop``.

    Raises:
        AssertionError: If ``args.type`` is neither ``simt`` nor
            ``tensorop884``.
    """
    if args.type == "simt":
        return GenerateDwconv2d_Simt(args, ConvKind.Fprop)
    # The two literals are concatenated by the parser, so the first one must
    # end with a space to keep the words of the message apart.
    assert args.type == "tensorop884", (
        "operation dwconv2d fprop only support "
        "simt, tensorop884. (got:{})".format(args.type)
    )
    return GenerateDwconv2d_TensorOp_884(args, ConvKind.Fprop)
-
-
def GenerateDwconv2dDgradOperations(args):
    """Dispatch depthwise-conv2d data-gradient (dgrad) generation on ``args.type``.

    Args:
        args: Parsed command-line arguments; only ``args.type`` is read here.

    Returns:
        Whatever the selected ``GenerateDwconv2d_*`` generator returns for
        ``ConvKind.Dgrad``.

    Raises:
        AssertionError: If ``args.type`` is neither ``simt`` nor
            ``tensorop884``.
    """
    if args.type == "simt":
        return GenerateDwconv2d_Simt(args, ConvKind.Dgrad)
    # The message names dgrad (it used to say fprop), and the first literal
    # ends with a space because the parser concatenates the two literals.
    assert args.type == "tensorop884", (
        "operation dwconv2d dgrad only support "
        "simt, tensorop884. (got:{})".format(args.type)
    )
    return GenerateDwconv2d_TensorOp_884(args, ConvKind.Dgrad)
-
-
def GenerateGemmOperations(args):
    """Dispatch GEMM kernel generation on ``args.type``.

    Args:
        args: Parsed command-line arguments; only ``args.type`` is read here.

    Returns:
        Whatever the selected GEMM generator returns.

    Raises:
        AssertionError: If ``args.type`` is not ``simt``, ``tensorop884`` or
            ``tensorop1688``.
    """
    if args.type == "tensorop884":
        return GeneratesGemm_TensorOp_884(args)
    if args.type == "tensorop1688":
        return GeneratesGemm_TensorOp_1688(args)
    # The message lists every type handled above, not only simt, and the
    # first literal ends with a space because the two are concatenated.
    assert args.type == "simt", (
        "operation gemm only support "
        "simt, tensorop884 and tensorop1688. (got:{})".format(args.type)
    )
    return GenerateGemm_Simt(args)
-
-
def GenerateGemvOperations(args):
    """Generate GEMV kernels; only the ``simt`` kernel type is available.

    Args:
        args: Parsed command-line arguments; only ``args.type`` is read here.

    Returns:
        Whatever ``GenerateGemv_Simt`` returns.

    Raises:
        AssertionError: If ``args.type`` is not ``simt``.
    """
    # The two literals are concatenated by the parser, so the first one must
    # end with a space to keep the words of the message apart.
    assert args.type == "simt", (
        "operation gemv only support " "simt. (got:{})".format(args.type)
    )
    return GenerateGemv_Simt(args)
-
-
- ###################################################################################################
- ###################################################################################################
-
if __name__ == "__main__":
    # Command-line entry point: parse the requested operation and kernel type,
    # build the matching operation list, emit one wrapper per operation and,
    # for everything except gemv, generate the manifest.

    parser = argparse.ArgumentParser(
        description="Generates device kernel registration code for CUTLASS Kernels"
    )
    parser.add_argument(
        "--operations",
        type=str,
        choices=[
            "gemm",
            "gemv",
            "conv2d",
            "deconv",
            "dwconv2d_fprop",
            "dwconv2d_dgrad",
            "dwconv2d_wgrad",
        ],
        required=True,
        help="Specifies the operation to generate (gemm, gemv, conv2d, deconv, dwconv2d_fprop, dwconv2d_dgrad, dwconv2d_wgrad)",
    )
    parser.add_argument(
        "output", type=str, help="output directory for CUTLASS kernel files"
    )
    parser.add_argument(
        "--type",
        type=str,
        choices=["simt", "tensorop8816", "tensorop8832", "tensorop884", "tensorop1688"],
        default="simt",
        help="kernel type of CUTLASS kernel generator",
    )

    gemv_wrapper_path = (
        "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper_batched_gemv_strided.cuinl"
    )
    # Short paths are used on Windows unless CUTLASS_WITH_LONG_PATH is set to
    # "true" (case-insensitive).
    short_path = (
        platform.system() == "Windows" or platform.system().find("NT") >= 0
    ) and ("true" != os.getenv("CUTLASS_WITH_LONG_PATH", default="False").lower())
    args = parser.parse_args()

    if args.operations == "gemm":
        operations = GenerateGemmOperations(args)
    elif args.operations == "gemv":
        operations = GenerateGemvOperations(args)
    elif args.operations == "conv2d":
        operations = GenerateConv2dOperations(args)
    elif args.operations == "deconv":
        operations = GenerateDeconvOperations(args)
    elif args.operations == "dwconv2d_fprop":
        operations = GenerateDwconv2dFpropOperations(args)
    elif args.operations == "dwconv2d_dgrad":
        operations = GenerateDwconv2dDgradOperations(args)
    elif args.operations == "dwconv2d_wgrad":
        # No wgrad generator is wired up here.  Previously this branch left
        # ``operations`` unbound and the emit loop below died with a
        # NameError; report the situation explicitly instead.
        parser.error("operation dwconv2d_wgrad is accepted but not implemented yet")

    if args.operations in (
        "conv2d",
        "deconv",
        "dwconv2d_fprop",
        "dwconv2d_dgrad",
        "dwconv2d_wgrad",
    ):
        for operation in operations:
            with EmitConvSingleKernelWrapper(
                args.output, operation, short_path
            ) as emitter:
                emitter.emit()
    elif args.operations == "gemm":
        for operation in operations:
            with EmitGemmSingleKernelWrapper(
                args.output, operation, short_path
            ) as emitter:
                emitter.emit()
    elif args.operations == "gemv":
        for operation in operations:
            with EmitGemvSingleKernelWrapper(
                args.output, operation, gemv_wrapper_path, short_path
            ) as emitter:
                emitter.emit()

    # gemv does not get a manifest.
    if args.operations != "gemv":
        GenerateManifest(args, operations, args.output)
-
- #
- ###################################################################################################
|