You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

list.bzl 118 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385
  1. # Generated by dnn/scripts/cutlass_generator/gen_list.py
  2. cutlass_gen_list = [
  3. "cutlass_simt_sgemm_8x32_8x2_nn_align1.cu",
  4. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu",
  5. "cutlass_simt_sgemm_16x32_8x2_nn_align1.cu",
  6. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu",
  7. "cutlass_simt_sgemm_16x64_8x2_nn_align1.cu",
  8. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu",
  9. "cutlass_simt_sgemm_32x32_8x2_nn_align1.cu",
  10. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu",
  11. "cutlass_simt_sgemm_32x64_8x2_nn_align1.cu",
  12. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu",
  13. "cutlass_simt_sgemm_64x32_8x2_nn_align1.cu",
  14. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu",
  15. "cutlass_simt_sgemm_16x128_8x2_nn_align1.cu",
  16. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu",
  17. "cutlass_simt_sgemm_32x128_8x2_nn_align1.cu",
  18. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu",
  19. "cutlass_simt_sgemm_64x64_8x2_nn_align1.cu",
  20. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu",
  21. "cutlass_simt_sgemm_128x32_8x2_nn_align1.cu",
  22. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu",
  23. "cutlass_simt_sgemm_64x128_8x2_nn_align1.cu",
  24. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu",
  25. "cutlass_simt_sgemm_128x64_8x2_nn_align1.cu",
  26. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu",
  27. "cutlass_simt_sgemm_32x256_8x2_nn_align1.cu",
  28. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu",
  29. "cutlass_simt_sgemm_64x256_8x2_nn_align1.cu",
  30. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu",
  31. "cutlass_simt_sgemm_128x128_8x2_nn_align1.cu",
  32. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu",
  33. "cutlass_simt_sgemm_256x32_8x2_nn_align1.cu",
  34. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu",
  35. "cutlass_simt_sgemm_256x64_8x2_nn_align1.cu",
  36. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu",
  37. "cutlass_simt_sgemm_8x32_8x2_nt_align1.cu",
  38. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu",
  39. "cutlass_simt_sgemm_16x32_8x2_nt_align1.cu",
  40. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu",
  41. "cutlass_simt_sgemm_16x64_8x2_nt_align1.cu",
  42. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu",
  43. "cutlass_simt_sgemm_32x32_8x2_nt_align1.cu",
  44. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu",
  45. "cutlass_simt_sgemm_32x64_8x2_nt_align1.cu",
  46. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu",
  47. "cutlass_simt_sgemm_64x32_8x2_nt_align1.cu",
  48. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu",
  49. "cutlass_simt_sgemm_16x128_8x2_nt_align1.cu",
  50. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu",
  51. "cutlass_simt_sgemm_32x128_8x2_nt_align1.cu",
  52. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu",
  53. "cutlass_simt_sgemm_64x64_8x2_nt_align1.cu",
  54. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu",
  55. "cutlass_simt_sgemm_128x32_8x2_nt_align1.cu",
  56. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu",
  57. "cutlass_simt_sgemm_64x128_8x2_nt_align1.cu",
  58. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu",
  59. "cutlass_simt_sgemm_128x64_8x2_nt_align1.cu",
  60. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu",
  61. "cutlass_simt_sgemm_32x256_8x2_nt_align1.cu",
  62. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu",
  63. "cutlass_simt_sgemm_64x256_8x2_nt_align1.cu",
  64. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu",
  65. "cutlass_simt_sgemm_128x128_8x2_nt_align1.cu",
  66. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu",
  67. "cutlass_simt_sgemm_256x32_8x2_nt_align1.cu",
  68. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu",
  69. "cutlass_simt_sgemm_256x64_8x2_nt_align1.cu",
  70. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu",
  71. "cutlass_simt_sgemm_8x32_8x2_tn_align1.cu",
  72. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu",
  73. "cutlass_simt_sgemm_16x32_8x2_tn_align1.cu",
  74. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu",
  75. "cutlass_simt_sgemm_16x64_8x2_tn_align1.cu",
  76. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu",
  77. "cutlass_simt_sgemm_32x32_8x2_tn_align1.cu",
  78. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu",
  79. "cutlass_simt_sgemm_32x64_8x2_tn_align1.cu",
  80. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu",
  81. "cutlass_simt_sgemm_64x32_8x2_tn_align1.cu",
  82. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu",
  83. "cutlass_simt_sgemm_16x128_8x2_tn_align1.cu",
  84. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu",
  85. "cutlass_simt_sgemm_32x128_8x2_tn_align1.cu",
  86. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu",
  87. "cutlass_simt_sgemm_64x64_8x2_tn_align1.cu",
  88. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu",
  89. "cutlass_simt_sgemm_128x32_8x2_tn_align1.cu",
  90. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu",
  91. "cutlass_simt_sgemm_64x128_8x2_tn_align1.cu",
  92. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu",
  93. "cutlass_simt_sgemm_128x64_8x2_tn_align1.cu",
  94. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu",
  95. "cutlass_simt_sgemm_32x256_8x2_tn_align1.cu",
  96. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu",
  97. "cutlass_simt_sgemm_64x256_8x2_tn_align1.cu",
  98. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu",
  99. "cutlass_simt_sgemm_128x128_8x2_tn_align1.cu",
  100. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu",
  101. "cutlass_simt_sgemm_256x32_8x2_tn_align1.cu",
  102. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu",
  103. "cutlass_simt_sgemm_256x64_8x2_tn_align1.cu",
  104. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu",
  105. "cutlass_simt_sgemm_8x32_8x2_tt_align1.cu",
  106. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu",
  107. "cutlass_simt_sgemm_16x32_8x2_tt_align1.cu",
  108. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu",
  109. "cutlass_simt_sgemm_16x64_8x2_tt_align1.cu",
  110. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu",
  111. "cutlass_simt_sgemm_32x32_8x2_tt_align1.cu",
  112. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu",
  113. "cutlass_simt_sgemm_32x64_8x2_tt_align1.cu",
  114. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu",
  115. "cutlass_simt_sgemm_64x32_8x2_tt_align1.cu",
  116. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu",
  117. "cutlass_simt_sgemm_16x128_8x2_tt_align1.cu",
  118. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu",
  119. "cutlass_simt_sgemm_32x128_8x2_tt_align1.cu",
  120. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu",
  121. "cutlass_simt_sgemm_64x64_8x2_tt_align1.cu",
  122. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu",
  123. "cutlass_simt_sgemm_128x32_8x2_tt_align1.cu",
  124. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu",
  125. "cutlass_simt_sgemm_64x128_8x2_tt_align1.cu",
  126. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu",
  127. "cutlass_simt_sgemm_128x64_8x2_tt_align1.cu",
  128. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu",
  129. "cutlass_simt_sgemm_32x256_8x2_tt_align1.cu",
  130. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu",
  131. "cutlass_simt_sgemm_64x256_8x2_tt_align1.cu",
  132. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu",
  133. "cutlass_simt_sgemm_128x128_8x2_tt_align1.cu",
  134. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu",
  135. "cutlass_simt_sgemm_256x32_8x2_tt_align1.cu",
  136. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu",
  137. "cutlass_simt_sgemm_256x64_8x2_tt_align1.cu",
  138. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu",
  139. "all_gemm_simt_operations.cu",
  140. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nn_align8.cu",
  141. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nn_align8.cu",
  142. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nn_align8.cu",
  143. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nn_align8.cu",
  144. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nn_align8.cu",
  145. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nn_align8.cu",
  146. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nn_align4.cu",
  147. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nn_align4.cu",
  148. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nn_align4.cu",
  149. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nn_align4.cu",
  150. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nn_align4.cu",
  151. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nn_align4.cu",
  152. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nn_align2.cu",
  153. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nn_align2.cu",
  154. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nn_align2.cu",
  155. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nn_align2.cu",
  156. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nn_align2.cu",
  157. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nn_align2.cu",
  158. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nt_align8.cu",
  159. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nt_align8.cu",
  160. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nt_align8.cu",
  161. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nt_align8.cu",
  162. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nt_align8.cu",
  163. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nt_align8.cu",
  164. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nt_align4.cu",
  165. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nt_align4.cu",
  166. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nt_align4.cu",
  167. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nt_align4.cu",
  168. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nt_align4.cu",
  169. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nt_align4.cu",
  170. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nt_align2.cu",
  171. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nt_align2.cu",
  172. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nt_align2.cu",
  173. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nt_align2.cu",
  174. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nt_align2.cu",
  175. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nt_align2.cu",
  176. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tn_align8.cu",
  177. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tn_align8.cu",
  178. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tn_align8.cu",
  179. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tn_align8.cu",
  180. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tn_align8.cu",
  181. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tn_align8.cu",
  182. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tn_align4.cu",
  183. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tn_align4.cu",
  184. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tn_align4.cu",
  185. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tn_align4.cu",
  186. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tn_align4.cu",
  187. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tn_align4.cu",
  188. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tn_align2.cu",
  189. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tn_align2.cu",
  190. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tn_align2.cu",
  191. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tn_align2.cu",
  192. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tn_align2.cu",
  193. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tn_align2.cu",
  194. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tt_align8.cu",
  195. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tt_align8.cu",
  196. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tt_align8.cu",
  197. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tt_align8.cu",
  198. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tt_align8.cu",
  199. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tt_align8.cu",
  200. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tt_align4.cu",
  201. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tt_align4.cu",
  202. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tt_align4.cu",
  203. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tt_align4.cu",
  204. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tt_align4.cu",
  205. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tt_align4.cu",
  206. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tt_align2.cu",
  207. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tt_align2.cu",
  208. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tt_align2.cu",
  209. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tt_align2.cu",
  210. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tt_align2.cu",
  211. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tt_align2.cu",
  212. "cutlass_tensorop_h1688gemm_256x128_32x2_nn_align8.cu",
  213. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nn_align8.cu",
  214. "cutlass_tensorop_h1688gemm_128x256_32x2_nn_align8.cu",
  215. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nn_align8.cu",
  216. "cutlass_tensorop_h1688gemm_128x128_32x2_nn_align8.cu",
  217. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nn_align8.cu",
  218. "cutlass_tensorop_h1688gemm_256x128_32x2_nn_align4.cu",
  219. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nn_align4.cu",
  220. "cutlass_tensorop_h1688gemm_128x256_32x2_nn_align4.cu",
  221. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nn_align4.cu",
  222. "cutlass_tensorop_h1688gemm_128x128_32x2_nn_align4.cu",
  223. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nn_align4.cu",
  224. "cutlass_tensorop_h1688gemm_256x128_32x2_nn_align2.cu",
  225. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nn_align2.cu",
  226. "cutlass_tensorop_h1688gemm_128x256_32x2_nn_align2.cu",
  227. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nn_align2.cu",
  228. "cutlass_tensorop_h1688gemm_128x128_32x2_nn_align2.cu",
  229. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nn_align2.cu",
  230. "cutlass_tensorop_h1688gemm_256x128_32x2_nt_align8.cu",
  231. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nt_align8.cu",
  232. "cutlass_tensorop_h1688gemm_128x256_32x2_nt_align8.cu",
  233. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nt_align8.cu",
  234. "cutlass_tensorop_h1688gemm_128x128_32x2_nt_align8.cu",
  235. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nt_align8.cu",
  236. "cutlass_tensorop_h1688gemm_256x128_32x2_nt_align4.cu",
  237. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nt_align4.cu",
  238. "cutlass_tensorop_h1688gemm_128x256_32x2_nt_align4.cu",
  239. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nt_align4.cu",
  240. "cutlass_tensorop_h1688gemm_128x128_32x2_nt_align4.cu",
  241. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nt_align4.cu",
  242. "cutlass_tensorop_h1688gemm_256x128_32x2_nt_align2.cu",
  243. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nt_align2.cu",
  244. "cutlass_tensorop_h1688gemm_128x256_32x2_nt_align2.cu",
  245. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nt_align2.cu",
  246. "cutlass_tensorop_h1688gemm_128x128_32x2_nt_align2.cu",
  247. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nt_align2.cu",
  248. "cutlass_tensorop_h1688gemm_256x128_32x2_tn_align8.cu",
  249. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tn_align8.cu",
  250. "cutlass_tensorop_h1688gemm_128x256_32x2_tn_align8.cu",
  251. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tn_align8.cu",
  252. "cutlass_tensorop_h1688gemm_128x128_32x2_tn_align8.cu",
  253. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tn_align8.cu",
  254. "cutlass_tensorop_h1688gemm_256x128_32x2_tn_align4.cu",
  255. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tn_align4.cu",
  256. "cutlass_tensorop_h1688gemm_128x256_32x2_tn_align4.cu",
  257. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tn_align4.cu",
  258. "cutlass_tensorop_h1688gemm_128x128_32x2_tn_align4.cu",
  259. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tn_align4.cu",
  260. "cutlass_tensorop_h1688gemm_256x128_32x2_tn_align2.cu",
  261. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tn_align2.cu",
  262. "cutlass_tensorop_h1688gemm_128x256_32x2_tn_align2.cu",
  263. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tn_align2.cu",
  264. "cutlass_tensorop_h1688gemm_128x128_32x2_tn_align2.cu",
  265. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tn_align2.cu",
  266. "cutlass_tensorop_h1688gemm_256x128_32x2_tt_align8.cu",
  267. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tt_align8.cu",
  268. "cutlass_tensorop_h1688gemm_128x256_32x2_tt_align8.cu",
  269. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tt_align8.cu",
  270. "cutlass_tensorop_h1688gemm_128x128_32x2_tt_align8.cu",
  271. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tt_align8.cu",
  272. "cutlass_tensorop_h1688gemm_256x128_32x2_tt_align4.cu",
  273. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tt_align4.cu",
  274. "cutlass_tensorop_h1688gemm_128x256_32x2_tt_align4.cu",
  275. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tt_align4.cu",
  276. "cutlass_tensorop_h1688gemm_128x128_32x2_tt_align4.cu",
  277. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tt_align4.cu",
  278. "cutlass_tensorop_h1688gemm_256x128_32x2_tt_align2.cu",
  279. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tt_align2.cu",
  280. "cutlass_tensorop_h1688gemm_128x256_32x2_tt_align2.cu",
  281. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tt_align2.cu",
  282. "cutlass_tensorop_h1688gemm_128x128_32x2_tt_align2.cu",
  283. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tt_align2.cu",
  284. "all_gemm_tensorop1688_operations.cu",
  285. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nn_align8.cu",
  286. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nn_align8.cu",
  287. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nn_align8.cu",
  288. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nn_align8.cu",
  289. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nn_align8.cu",
  290. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nn_align8.cu",
  291. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nn_align4.cu",
  292. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nn_align4.cu",
  293. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nn_align4.cu",
  294. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nn_align4.cu",
  295. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nn_align4.cu",
  296. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nn_align4.cu",
  297. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nn_align2.cu",
  298. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nn_align2.cu",
  299. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nn_align2.cu",
  300. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nn_align2.cu",
  301. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nn_align2.cu",
  302. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nn_align2.cu",
  303. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nt_align8.cu",
  304. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nt_align8.cu",
  305. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nt_align8.cu",
  306. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nt_align8.cu",
  307. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nt_align8.cu",
  308. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nt_align8.cu",
  309. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nt_align4.cu",
  310. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nt_align4.cu",
  311. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nt_align4.cu",
  312. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nt_align4.cu",
  313. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nt_align4.cu",
  314. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nt_align4.cu",
  315. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nt_align2.cu",
  316. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nt_align2.cu",
  317. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nt_align2.cu",
  318. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nt_align2.cu",
  319. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nt_align2.cu",
  320. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nt_align2.cu",
  321. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tn_align8.cu",
  322. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tn_align8.cu",
  323. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tn_align8.cu",
  324. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tn_align8.cu",
  325. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tn_align8.cu",
  326. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tn_align8.cu",
  327. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tn_align4.cu",
  328. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tn_align4.cu",
  329. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tn_align4.cu",
  330. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tn_align4.cu",
  331. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tn_align4.cu",
  332. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tn_align4.cu",
  333. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tn_align2.cu",
  334. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tn_align2.cu",
  335. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tn_align2.cu",
  336. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tn_align2.cu",
  337. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tn_align2.cu",
  338. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tn_align2.cu",
  339. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tt_align8.cu",
  340. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tt_align8.cu",
  341. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tt_align8.cu",
  342. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tt_align8.cu",
  343. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tt_align8.cu",
  344. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tt_align8.cu",
  345. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tt_align4.cu",
  346. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tt_align4.cu",
  347. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tt_align4.cu",
  348. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tt_align4.cu",
  349. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tt_align4.cu",
  350. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tt_align4.cu",
  351. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tt_align2.cu",
  352. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tt_align2.cu",
  353. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tt_align2.cu",
  354. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tt_align2.cu",
  355. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tt_align2.cu",
  356. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tt_align2.cu",
  357. "cutlass_tensorop_h884gemm_256x128_32x2_nn_align8.cu",
  358. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nn_align8.cu",
  359. "cutlass_tensorop_h884gemm_128x256_32x2_nn_align8.cu",
  360. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nn_align8.cu",
  361. "cutlass_tensorop_h884gemm_128x128_32x2_nn_align8.cu",
  362. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nn_align8.cu",
  363. "cutlass_tensorop_h884gemm_256x128_32x2_nn_align4.cu",
  364. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nn_align4.cu",
  365. "cutlass_tensorop_h884gemm_128x256_32x2_nn_align4.cu",
  366. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nn_align4.cu",
  367. "cutlass_tensorop_h884gemm_128x128_32x2_nn_align4.cu",
  368. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nn_align4.cu",
  369. "cutlass_tensorop_h884gemm_256x128_32x2_nn_align2.cu",
  370. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nn_align2.cu",
  371. "cutlass_tensorop_h884gemm_128x256_32x2_nn_align2.cu",
  372. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nn_align2.cu",
  373. "cutlass_tensorop_h884gemm_128x128_32x2_nn_align2.cu",
  374. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nn_align2.cu",
  375. "cutlass_tensorop_h884gemm_256x128_32x2_nt_align8.cu",
  376. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nt_align8.cu",
  377. "cutlass_tensorop_h884gemm_128x256_32x2_nt_align8.cu",
  378. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nt_align8.cu",
  379. "cutlass_tensorop_h884gemm_128x128_32x2_nt_align8.cu",
  380. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nt_align8.cu",
  381. "cutlass_tensorop_h884gemm_256x128_32x2_nt_align4.cu",
  382. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nt_align4.cu",
  383. "cutlass_tensorop_h884gemm_128x256_32x2_nt_align4.cu",
  384. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nt_align4.cu",
  385. "cutlass_tensorop_h884gemm_128x128_32x2_nt_align4.cu",
  386. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nt_align4.cu",
  387. "cutlass_tensorop_h884gemm_256x128_32x2_nt_align2.cu",
  388. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nt_align2.cu",
  389. "cutlass_tensorop_h884gemm_128x256_32x2_nt_align2.cu",
  390. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nt_align2.cu",
  391. "cutlass_tensorop_h884gemm_128x128_32x2_nt_align2.cu",
  392. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nt_align2.cu",
  393. "cutlass_tensorop_h884gemm_256x128_32x2_tn_align8.cu",
  394. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tn_align8.cu",
  395. "cutlass_tensorop_h884gemm_128x256_32x2_tn_align8.cu",
  396. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tn_align8.cu",
  397. "cutlass_tensorop_h884gemm_128x128_32x2_tn_align8.cu",
  398. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tn_align8.cu",
  399. "cutlass_tensorop_h884gemm_256x128_32x2_tn_align4.cu",
  400. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tn_align4.cu",
  401. "cutlass_tensorop_h884gemm_128x256_32x2_tn_align4.cu",
  402. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tn_align4.cu",
  403. "cutlass_tensorop_h884gemm_128x128_32x2_tn_align4.cu",
  404. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tn_align4.cu",
  405. "cutlass_tensorop_h884gemm_256x128_32x2_tn_align2.cu",
  406. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tn_align2.cu",
  407. "cutlass_tensorop_h884gemm_128x256_32x2_tn_align2.cu",
  408. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tn_align2.cu",
  409. "cutlass_tensorop_h884gemm_128x128_32x2_tn_align2.cu",
  410. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tn_align2.cu",
  411. "cutlass_tensorop_h884gemm_256x128_32x2_tt_align8.cu",
  412. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tt_align8.cu",
  413. "cutlass_tensorop_h884gemm_128x256_32x2_tt_align8.cu",
  414. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tt_align8.cu",
  415. "cutlass_tensorop_h884gemm_128x128_32x2_tt_align8.cu",
  416. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tt_align8.cu",
  417. "cutlass_tensorop_h884gemm_256x128_32x2_tt_align4.cu",
  418. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tt_align4.cu",
  419. "cutlass_tensorop_h884gemm_128x256_32x2_tt_align4.cu",
  420. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tt_align4.cu",
  421. "cutlass_tensorop_h884gemm_128x128_32x2_tt_align4.cu",
  422. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tt_align4.cu",
  423. "cutlass_tensorop_h884gemm_256x128_32x2_tt_align2.cu",
  424. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tt_align2.cu",
  425. "cutlass_tensorop_h884gemm_128x256_32x2_tt_align2.cu",
  426. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tt_align2.cu",
  427. "cutlass_tensorop_h884gemm_128x128_32x2_tt_align2.cu",
  428. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tt_align2.cu",
  429. "all_gemm_tensorop884_operations.cu",
  430. "cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu",
  431. "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu",
  432. "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu",
  433. "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu",
  434. "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu",
  435. "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu",
  436. "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu",
  437. "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu",
  438. "cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu",
  439. "cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu",
  440. "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu",
  441. "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu",
  442. "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu",
  443. "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu",
  444. "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu",
  445. "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu",
  446. "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu",
  447. "cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu",
  448. "cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu",
  449. "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu",
  450. "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu",
  451. "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu",
  452. "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu",
  453. "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu",
  454. "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu",
  455. "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu",
  456. "cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu",
  457. "cutlass_simt_s8_idgrad_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4_align4x16.cu",
  458. "cutlass_simt_s8_idgrad_s2_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4_align4x16.cu",
  459. "cutlass_simt_s8_idgrad_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4_align4x4.cu",
  460. "cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4_align4x4.cu",
  461. "cutlass_simt_s8_idgrad_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4_align4x8.cu",
  462. "cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4_align4x8.cu",
  463. "cutlass_simt_s8_idgrad_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4_align4x4.cu",
  464. "cutlass_simt_s8_idgrad_s2_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4_align4x4.cu",
  465. "all_deconv_simt_operations.cu",
  466. "cutlass_tensorop_s8_i8816dgrad_id_s8_128x32x32_64x32x32_1_nhwc_ck4rs4_align4x4.cu",
  467. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_128x32x32_64x32x32_1_nhwc_ck4rs4_align4x4.cu",
  468. "cutlass_tensorop_s8_i8816dgrad_id_s8_64x16x32_64x16x32_2_nhwc_ck4rs4_align4x4.cu",
  469. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_64x16x32_64x16x32_2_nhwc_ck4rs4_align4x4.cu",
  470. "cutlass_tensorop_s8_i8816dgrad_id_s8_128x32x32_64x32x32_1_nhwc_ck8rs8_align8x8.cu",
  471. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_128x32x32_64x32x32_1_nhwc_ck8rs8_align8x8.cu",
  472. "cutlass_tensorop_s8_i8816dgrad_id_s8_64x16x32_64x16x32_2_nhwc_ck8rs8_align8x8.cu",
  473. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_64x16x32_64x16x32_2_nhwc_ck8rs8_align8x8.cu",
  474. "cutlass_tensorop_s8_i8816dgrad_id_s8_128x32x32_64x32x32_1_nhwc_ck16rs16_align16x16.cu",
  475. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_128x32x32_64x32x32_1_nhwc_ck16rs16_align16x16.cu",
  476. "cutlass_tensorop_s8_i8816dgrad_id_s8_64x16x32_64x16x32_2_nhwc_ck16rs16_align16x16.cu",
  477. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_64x16x32_64x16x32_2_nhwc_ck16rs16_align16x16.cu",
  478. "all_deconv_tensorop8816_operations.cu",
  479. "cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  480. "cutlass_simt_s8_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  481. "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  482. "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  483. "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  484. "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  485. "cutlass_simt_s8_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  486. "cutlass_simt_s8_ifprop_1x1_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  487. "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  488. "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  489. "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  490. "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  491. "cutlass_simt_s8_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  492. "cutlass_simt_s8_ifprop_1x1_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  493. "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  494. "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  495. "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  496. "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  497. "cutlass_simt_s8_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  498. "cutlass_simt_s8_ifprop_1x1_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  499. "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  500. "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  501. "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  502. "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  503. "cutlass_simt_s8_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  504. "cutlass_simt_s8_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  505. "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  506. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  507. "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  508. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  509. "cutlass_simt_s8_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  510. "cutlass_simt_s8_ifprop_1x1_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  511. "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  512. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  513. "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  514. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  515. "cutlass_simt_s8_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  516. "cutlass_simt_s8_ifprop_1x1_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  517. "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  518. "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  519. "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  520. "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  521. "cutlass_simt_s8_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  522. "cutlass_simt_s8_ifprop_1x1_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  523. "cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  524. "cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  525. "cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  526. "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  527. "cutlass_simt_s8_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  528. "cutlass_simt_s8_ifprop_1x1_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  529. "cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  530. "cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  531. "cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  532. "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  533. "cutlass_simt_s8_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  534. "cutlass_simt_s8_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  535. "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  536. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  537. "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  538. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  539. "cutlass_simt_s8_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  540. "cutlass_simt_s8_ifprop_1x1_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  541. "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  542. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  543. "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  544. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  545. "cutlass_simt_u4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  546. "cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  547. "cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  548. "cutlass_simt_u4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  549. "cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  550. "cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  551. "cutlass_simt_s4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  552. "cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  553. "cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  554. "cutlass_simt_s4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  555. "cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  556. "cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  557. "cutlass_simt_f32_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  558. "cutlass_simt_f32_ifprop_1x1_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  559. "cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  560. "cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  561. "cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  562. "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  563. "cutlass_simt_f32_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  564. "cutlass_simt_f32_ifprop_1x1_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  565. "cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  566. "cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  567. "cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  568. "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  569. "all_conv2d_simt_operations.cu",
  570. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  571. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  572. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  573. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  574. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  575. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  576. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  577. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  578. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  579. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  580. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  581. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  582. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  583. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  584. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  585. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  586. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  587. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  588. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  589. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  590. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  591. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  592. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  593. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  594. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  595. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  596. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  597. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  598. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  599. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  600. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  601. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  602. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  603. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  604. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  605. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  606. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  607. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  608. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  609. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  610. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  611. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  612. "cutlass_tensorop_s8_i8816fprop_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  613. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  614. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  615. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  616. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  617. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  618. "cutlass_tensorop_s8_i8816fprop_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  619. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  620. "cutlass_tensorop_s8_i8816fprop_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  621. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  622. "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  623. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  624. "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  625. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  626. "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  627. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  628. "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  629. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  630. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  631. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  632. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  633. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  634. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  635. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  636. "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  637. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  638. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  639. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  640. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  641. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  642. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  643. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  644. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  645. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  646. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  647. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  648. "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  649. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  650. "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  651. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  652. "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  653. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  654. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  655. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  656. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  657. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  658. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  659. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  660. "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  661. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  662. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  663. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  664. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  665. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  666. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  667. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  668. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  669. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  670. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  671. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  672. "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  673. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  674. "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  675. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  676. "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  677. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  678. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  679. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  680. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  681. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  682. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  683. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  684. "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  685. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  686. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  687. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  688. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  689. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  690. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  691. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  692. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  693. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  694. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  695. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  696. "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  697. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  698. "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  699. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  700. "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  701. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  702. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  703. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  704. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  705. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  706. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  707. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  708. "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  709. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  710. "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  711. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  712. "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  713. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  714. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  715. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  716. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  717. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  718. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  719. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  720. "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  721. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  722. "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  723. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  724. "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  725. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  726. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  727. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  728. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  729. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  730. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  731. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  732. "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  733. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  734. "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  735. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  736. "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  737. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  738. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  739. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  740. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  741. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  742. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  743. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  744. "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  745. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  746. "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  747. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  748. "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  749. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  750. "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  751. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  752. "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  753. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  754. "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  755. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  756. "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  757. "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  758. "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  759. "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  760. "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  761. "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  762. "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  763. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  764. "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  765. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  766. "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  767. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  768. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  769. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  770. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  771. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  772. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  773. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  774. "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  775. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  776. "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  777. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  778. "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  779. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  780. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  781. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  782. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  783. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  784. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  785. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  786. "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  787. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  788. "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  789. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  790. "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  791. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  792. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  793. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  794. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  795. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  796. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  797. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  798. "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  799. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  800. "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  801. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  802. "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  803. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  804. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  805. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  806. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  807. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  808. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  809. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  810. "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  811. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  812. "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  813. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  814. "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  815. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  816. "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  817. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  818. "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  819. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  820. "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  821. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  822. "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  823. "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  824. "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  825. "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  826. "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  827. "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  828. "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  829. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  830. "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  831. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  832. "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  833. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  834. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  835. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  836. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  837. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  838. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  839. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  840. "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  841. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  842. "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  843. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  844. "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  845. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  846. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  847. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  848. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  849. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  850. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  851. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  852. "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  853. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  854. "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  855. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  856. "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  857. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  858. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  859. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  860. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  861. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  862. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  863. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  864. "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  865. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  866. "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  867. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  868. "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  869. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  870. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  871. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  872. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  873. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  874. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  875. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  876. "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  877. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  878. "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  879. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  880. "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  881. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  882. "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  883. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  884. "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  885. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  886. "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  887. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  888. "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  889. "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  890. "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  891. "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  892. "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  893. "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  894. "all_conv2d_tensorop8816_operations.cu",
  895. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  896. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  897. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  898. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  899. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  900. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  901. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  902. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  903. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  904. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  905. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  906. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  907. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  908. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  909. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  910. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  911. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  912. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  913. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  914. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  915. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  916. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  917. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  918. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  919. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  920. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  921. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  922. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  923. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  924. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  925. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  926. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  927. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  928. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  929. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  930. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  931. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  932. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  933. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  934. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  935. "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  936. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  937. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  938. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  939. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  940. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  941. "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  942. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  943. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  944. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  945. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  946. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  947. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  948. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  949. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  950. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  951. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  952. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  953. "cutlass_tensorop_s4_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  954. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  955. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  956. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  957. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  958. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  959. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  960. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  961. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  962. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  963. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  964. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  965. "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  966. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  967. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  968. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  969. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  970. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  971. "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  972. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  973. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  974. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  975. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  976. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  977. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  978. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  979. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  980. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  981. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  982. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  983. "cutlass_tensorop_s4_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  984. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  985. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  986. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  987. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  988. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  989. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  990. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  991. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  992. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  993. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  994. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  995. "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  996. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  997. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  998. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  999. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1000. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1001. "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1002. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1003. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1004. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1005. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1006. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1007. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1008. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1009. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1010. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1011. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1012. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1013. "cutlass_tensorop_s4_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1014. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1015. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1016. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1017. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1018. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1019. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1020. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1021. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1022. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1023. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1024. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1025. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1026. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1027. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1028. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1029. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1030. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1031. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1032. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1033. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1034. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1035. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1036. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1037. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1038. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1039. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1040. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1041. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1042. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1043. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1044. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1045. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1046. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1047. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1048. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1049. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1050. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1051. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1052. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1053. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1054. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1055. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1056. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1057. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1058. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1059. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1060. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1061. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1062. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1063. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1064. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1065. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1066. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1067. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1068. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1069. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1070. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1071. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1072. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1073. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1074. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1075. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1076. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1077. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1078. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1079. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1080. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1081. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1082. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1083. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1084. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1085. "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1086. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1087. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1088. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1089. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1090. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1091. "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1092. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1093. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1094. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1095. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1096. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1097. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1098. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1099. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1100. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1101. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1102. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1103. "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1104. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1105. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1106. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1107. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1108. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1109. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1110. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1111. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1112. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1113. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1114. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1115. "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1116. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1117. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1118. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1119. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1120. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1121. "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1122. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1123. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1124. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1125. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1126. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1127. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1128. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1129. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1130. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1131. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1132. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1133. "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1134. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1135. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1136. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1137. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1138. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1139. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1140. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1141. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1142. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1143. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1144. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1145. "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1146. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1147. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1148. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1149. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1150. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1151. "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1152. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1153. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1154. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1155. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1156. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1157. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1158. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1159. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1160. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1161. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1162. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1163. "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1164. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1165. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1166. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1167. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1168. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1169. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1170. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1171. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1172. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1173. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1174. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1175. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1176. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1177. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1178. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1179. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1180. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1181. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1182. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1183. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1184. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1185. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1186. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1187. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1188. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1189. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1190. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1191. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1192. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1193. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1194. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1195. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1196. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1197. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1198. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1199. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1200. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1201. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1202. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1203. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1204. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1205. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1206. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1207. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1208. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1209. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1210. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1211. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1212. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1213. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1214. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1215. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1216. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1217. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1218. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1219. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1220. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1221. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1222. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1223. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1224. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1225. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1226. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1227. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1228. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1229. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1230. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1231. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1232. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1233. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1234. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1235. "all_conv2d_tensorop8832_operations.cu",
  1236. "cutlass_simt_sdwfprop_id_f32_32x32x8_32x32x8_2_nchw_nchw_align4x1.cu",
  1237. "cutlass_simt_sdwfprop_relu_f32_32x32x8_32x32x8_2_nchw_nchw_align4x1.cu",
  1238. "cutlass_simt_sdwfprop_id_f32_32x32x8_32x32x8_2_nchw_nchw_align1x1.cu",
  1239. "cutlass_simt_sdwfprop_relu_f32_32x32x8_32x32x8_2_nchw_nchw_align1x1.cu",
  1240. "cutlass_simt_sdwfprop_id_f32_32x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1241. "cutlass_simt_sdwfprop_relu_f32_32x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1242. "cutlass_simt_sdwfprop_id_f32_32x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1243. "cutlass_simt_sdwfprop_relu_f32_32x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1244. "cutlass_simt_sdwfprop_id_f32_64x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1245. "cutlass_simt_sdwfprop_relu_f32_64x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1246. "cutlass_simt_sdwfprop_id_f32_64x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1247. "cutlass_simt_sdwfprop_relu_f32_64x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1248. "cutlass_simt_sdwfprop_id_f32_32x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1249. "cutlass_simt_sdwfprop_relu_f32_32x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1250. "cutlass_simt_sdwfprop_id_f32_32x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1251. "cutlass_simt_sdwfprop_relu_f32_32x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1252. "cutlass_simt_sdwfprop_id_f32_64x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1253. "cutlass_simt_sdwfprop_relu_f32_64x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1254. "cutlass_simt_sdwfprop_id_f32_64x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1255. "cutlass_simt_sdwfprop_relu_f32_64x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1256. "cutlass_simt_sdwfprop_id_f32_128x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1257. "cutlass_simt_sdwfprop_relu_f32_128x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1258. "cutlass_simt_sdwfprop_id_f32_128x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1259. "cutlass_simt_sdwfprop_relu_f32_128x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1260. "cutlass_simt_sdwfprop_id_f32_64x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1261. "cutlass_simt_sdwfprop_relu_f32_64x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1262. "cutlass_simt_sdwfprop_id_f32_64x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1263. "cutlass_simt_sdwfprop_relu_f32_64x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1264. "cutlass_simt_sdwfprop_id_f32_128x64x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1265. "cutlass_simt_sdwfprop_relu_f32_128x64x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1266. "cutlass_simt_sdwfprop_id_f32_128x64x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1267. "cutlass_simt_sdwfprop_relu_f32_128x64x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1268. "cutlass_simt_sdwfprop_id_f32_128x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1269. "cutlass_simt_sdwfprop_relu_f32_128x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1270. "cutlass_simt_sdwfprop_id_f32_128x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1271. "cutlass_simt_sdwfprop_relu_f32_128x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1272. "all_dwconv2d_fprop_simt_operations.cu",
  1273. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1274. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1275. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1276. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1277. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1278. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1279. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1280. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1281. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1282. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1283. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1284. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1285. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1286. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1287. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1288. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1289. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1290. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1291. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1292. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1293. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1294. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1295. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1296. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1297. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1298. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1299. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1300. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1301. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1302. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1303. "cutlass_tensorop_h884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1304. "cutlass_tensorop_h884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1305. "cutlass_tensorop_h884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1306. "cutlass_tensorop_h884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1307. "cutlass_tensorop_h884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1308. "cutlass_tensorop_h884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1309. "cutlass_tensorop_h884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1310. "cutlass_tensorop_h884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1311. "cutlass_tensorop_h884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1312. "cutlass_tensorop_h884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1313. "cutlass_tensorop_h884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1314. "cutlass_tensorop_h884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1315. "cutlass_tensorop_h884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1316. "cutlass_tensorop_h884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1317. "cutlass_tensorop_h884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1318. "cutlass_tensorop_h884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1319. "cutlass_tensorop_h884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1320. "cutlass_tensorop_h884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1321. "cutlass_tensorop_h884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1322. "cutlass_tensorop_h884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1323. "cutlass_tensorop_h884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1324. "cutlass_tensorop_h884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1325. "cutlass_tensorop_h884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1326. "cutlass_tensorop_h884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1327. "cutlass_tensorop_h884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1328. "cutlass_tensorop_h884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1329. "cutlass_tensorop_h884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1330. "cutlass_tensorop_h884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1331. "cutlass_tensorop_h884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1332. "cutlass_tensorop_h884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1333. "all_dwconv2d_fprop_tensorop884_operations.cu",
  1334. "cutlass_simt_sdwdgrad_id_f32_32x32x8_32x32x8_2_nchw_nchw_align4x1.cu",
  1335. "cutlass_simt_sdwdgrad_id_f32_32x32x8_32x32x8_2_nchw_nchw_align1x1.cu",
  1336. "cutlass_simt_sdwdgrad_id_f32_32x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1337. "cutlass_simt_sdwdgrad_id_f32_32x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1338. "cutlass_simt_sdwdgrad_id_f32_64x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1339. "cutlass_simt_sdwdgrad_id_f32_64x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1340. "cutlass_simt_sdwdgrad_id_f32_32x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1341. "cutlass_simt_sdwdgrad_id_f32_32x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1342. "cutlass_simt_sdwdgrad_id_f32_64x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1343. "cutlass_simt_sdwdgrad_id_f32_64x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1344. "cutlass_simt_sdwdgrad_id_f32_128x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1345. "cutlass_simt_sdwdgrad_id_f32_128x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1346. "cutlass_simt_sdwdgrad_id_f32_64x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1347. "cutlass_simt_sdwdgrad_id_f32_64x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1348. "cutlass_simt_sdwdgrad_id_f32_128x64x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1349. "cutlass_simt_sdwdgrad_id_f32_128x64x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1350. "cutlass_simt_sdwdgrad_id_f32_128x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1351. "cutlass_simt_sdwdgrad_id_f32_128x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1352. "all_dwconv2d_dgrad_simt_operations.cu",
  1353. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1354. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1355. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1356. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1357. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1358. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1359. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1360. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1361. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1362. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1363. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1364. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1365. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1366. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1367. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1368. "cutlass_tensorop_h884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1369. "cutlass_tensorop_h884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1370. "cutlass_tensorop_h884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1371. "cutlass_tensorop_h884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1372. "cutlass_tensorop_h884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1373. "cutlass_tensorop_h884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1374. "cutlass_tensorop_h884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1375. "cutlass_tensorop_h884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1376. "cutlass_tensorop_h884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1377. "cutlass_tensorop_h884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1378. "cutlass_tensorop_h884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1379. "cutlass_tensorop_h884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1380. "cutlass_tensorop_h884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1381. "cutlass_tensorop_h884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1382. "cutlass_tensorop_h884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1383. "all_dwconv2d_dgrad_tensorop884_operations.cu",
  1384. ]