You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

list.bzl 123 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441
  1. # Generated by dnn/scripts/cutlass_generator/gen_list.py
  2. cutlass_gen_list = [
  3. "cutlass_simt_sgemm_8x32_8x2_nn_align1.cu",
  4. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu",
  5. "cutlass_simt_sgemm_16x32_8x2_nn_align1.cu",
  6. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu",
  7. "cutlass_simt_sgemm_16x64_8x2_nn_align1.cu",
  8. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu",
  9. "cutlass_simt_sgemm_32x32_8x2_nn_align1.cu",
  10. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu",
  11. "cutlass_simt_sgemm_32x64_8x2_nn_align1.cu",
  12. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu",
  13. "cutlass_simt_sgemm_64x32_8x2_nn_align1.cu",
  14. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu",
  15. "cutlass_simt_sgemm_16x128_8x2_nn_align1.cu",
  16. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu",
  17. "cutlass_simt_sgemm_32x128_8x2_nn_align1.cu",
  18. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu",
  19. "cutlass_simt_sgemm_64x64_8x2_nn_align1.cu",
  20. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu",
  21. "cutlass_simt_sgemm_128x32_8x2_nn_align1.cu",
  22. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu",
  23. "cutlass_simt_sgemm_64x128_8x2_nn_align1.cu",
  24. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu",
  25. "cutlass_simt_sgemm_128x64_8x2_nn_align1.cu",
  26. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu",
  27. "cutlass_simt_sgemm_32x256_8x2_nn_align1.cu",
  28. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu",
  29. "cutlass_simt_sgemm_64x256_8x2_nn_align1.cu",
  30. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu",
  31. "cutlass_simt_sgemm_128x128_8x2_nn_align1.cu",
  32. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu",
  33. "cutlass_simt_sgemm_256x32_8x2_nn_align1.cu",
  34. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu",
  35. "cutlass_simt_sgemm_256x64_8x2_nn_align1.cu",
  36. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu",
  37. "cutlass_simt_sgemm_8x32_8x2_nt_align1.cu",
  38. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu",
  39. "cutlass_simt_sgemm_16x32_8x2_nt_align1.cu",
  40. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu",
  41. "cutlass_simt_sgemm_16x64_8x2_nt_align1.cu",
  42. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu",
  43. "cutlass_simt_sgemm_32x32_8x2_nt_align1.cu",
  44. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu",
  45. "cutlass_simt_sgemm_32x64_8x2_nt_align1.cu",
  46. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu",
  47. "cutlass_simt_sgemm_64x32_8x2_nt_align1.cu",
  48. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu",
  49. "cutlass_simt_sgemm_16x128_8x2_nt_align1.cu",
  50. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu",
  51. "cutlass_simt_sgemm_32x128_8x2_nt_align1.cu",
  52. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu",
  53. "cutlass_simt_sgemm_64x64_8x2_nt_align1.cu",
  54. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu",
  55. "cutlass_simt_sgemm_128x32_8x2_nt_align1.cu",
  56. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu",
  57. "cutlass_simt_sgemm_64x128_8x2_nt_align1.cu",
  58. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu",
  59. "cutlass_simt_sgemm_128x64_8x2_nt_align1.cu",
  60. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu",
  61. "cutlass_simt_sgemm_32x256_8x2_nt_align1.cu",
  62. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu",
  63. "cutlass_simt_sgemm_64x256_8x2_nt_align1.cu",
  64. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu",
  65. "cutlass_simt_sgemm_128x128_8x2_nt_align1.cu",
  66. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu",
  67. "cutlass_simt_sgemm_256x32_8x2_nt_align1.cu",
  68. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu",
  69. "cutlass_simt_sgemm_256x64_8x2_nt_align1.cu",
  70. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu",
  71. "cutlass_simt_sgemm_8x32_8x2_tn_align1.cu",
  72. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu",
  73. "cutlass_simt_sgemm_16x32_8x2_tn_align1.cu",
  74. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu",
  75. "cutlass_simt_sgemm_16x64_8x2_tn_align1.cu",
  76. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu",
  77. "cutlass_simt_sgemm_32x32_8x2_tn_align1.cu",
  78. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu",
  79. "cutlass_simt_sgemm_32x64_8x2_tn_align1.cu",
  80. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu",
  81. "cutlass_simt_sgemm_64x32_8x2_tn_align1.cu",
  82. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu",
  83. "cutlass_simt_sgemm_16x128_8x2_tn_align1.cu",
  84. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu",
  85. "cutlass_simt_sgemm_32x128_8x2_tn_align1.cu",
  86. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu",
  87. "cutlass_simt_sgemm_64x64_8x2_tn_align1.cu",
  88. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu",
  89. "cutlass_simt_sgemm_128x32_8x2_tn_align1.cu",
  90. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu",
  91. "cutlass_simt_sgemm_64x128_8x2_tn_align1.cu",
  92. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu",
  93. "cutlass_simt_sgemm_128x64_8x2_tn_align1.cu",
  94. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu",
  95. "cutlass_simt_sgemm_32x256_8x2_tn_align1.cu",
  96. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu",
  97. "cutlass_simt_sgemm_64x256_8x2_tn_align1.cu",
  98. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu",
  99. "cutlass_simt_sgemm_128x128_8x2_tn_align1.cu",
  100. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu",
  101. "cutlass_simt_sgemm_256x32_8x2_tn_align1.cu",
  102. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu",
  103. "cutlass_simt_sgemm_256x64_8x2_tn_align1.cu",
  104. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu",
  105. "cutlass_simt_sgemm_8x32_8x2_tt_align1.cu",
  106. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu",
  107. "cutlass_simt_sgemm_16x32_8x2_tt_align1.cu",
  108. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu",
  109. "cutlass_simt_sgemm_16x64_8x2_tt_align1.cu",
  110. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu",
  111. "cutlass_simt_sgemm_32x32_8x2_tt_align1.cu",
  112. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu",
  113. "cutlass_simt_sgemm_32x64_8x2_tt_align1.cu",
  114. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu",
  115. "cutlass_simt_sgemm_64x32_8x2_tt_align1.cu",
  116. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu",
  117. "cutlass_simt_sgemm_16x128_8x2_tt_align1.cu",
  118. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu",
  119. "cutlass_simt_sgemm_32x128_8x2_tt_align1.cu",
  120. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu",
  121. "cutlass_simt_sgemm_64x64_8x2_tt_align1.cu",
  122. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu",
  123. "cutlass_simt_sgemm_128x32_8x2_tt_align1.cu",
  124. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu",
  125. "cutlass_simt_sgemm_64x128_8x2_tt_align1.cu",
  126. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu",
  127. "cutlass_simt_sgemm_128x64_8x2_tt_align1.cu",
  128. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu",
  129. "cutlass_simt_sgemm_32x256_8x2_tt_align1.cu",
  130. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu",
  131. "cutlass_simt_sgemm_64x256_8x2_tt_align1.cu",
  132. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu",
  133. "cutlass_simt_sgemm_128x128_8x2_tt_align1.cu",
  134. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu",
  135. "cutlass_simt_sgemm_256x32_8x2_tt_align1.cu",
  136. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu",
  137. "cutlass_simt_sgemm_256x64_8x2_tt_align1.cu",
  138. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu",
  139. "all_gemm_simt_operations.cu",
  140. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nn_align8.cu",
  141. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nn_align8.cu",
  142. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nn_align8.cu",
  143. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nn_align8.cu",
  144. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nn_align8.cu",
  145. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nn_align8.cu",
  146. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nn_align4.cu",
  147. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nn_align4.cu",
  148. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nn_align4.cu",
  149. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nn_align4.cu",
  150. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nn_align4.cu",
  151. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nn_align4.cu",
  152. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nn_align2.cu",
  153. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nn_align2.cu",
  154. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nn_align2.cu",
  155. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nn_align2.cu",
  156. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nn_align2.cu",
  157. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nn_align2.cu",
  158. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nt_align8.cu",
  159. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nt_align8.cu",
  160. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nt_align8.cu",
  161. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nt_align8.cu",
  162. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nt_align8.cu",
  163. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nt_align8.cu",
  164. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nt_align4.cu",
  165. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nt_align4.cu",
  166. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nt_align4.cu",
  167. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nt_align4.cu",
  168. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nt_align4.cu",
  169. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nt_align4.cu",
  170. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nt_align2.cu",
  171. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nt_align2.cu",
  172. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nt_align2.cu",
  173. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nt_align2.cu",
  174. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nt_align2.cu",
  175. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nt_align2.cu",
  176. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tn_align8.cu",
  177. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tn_align8.cu",
  178. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tn_align8.cu",
  179. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tn_align8.cu",
  180. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tn_align8.cu",
  181. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tn_align8.cu",
  182. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tn_align4.cu",
  183. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tn_align4.cu",
  184. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tn_align4.cu",
  185. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tn_align4.cu",
  186. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tn_align4.cu",
  187. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tn_align4.cu",
  188. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tn_align2.cu",
  189. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tn_align2.cu",
  190. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tn_align2.cu",
  191. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tn_align2.cu",
  192. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tn_align2.cu",
  193. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tn_align2.cu",
  194. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tt_align8.cu",
  195. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tt_align8.cu",
  196. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tt_align8.cu",
  197. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tt_align8.cu",
  198. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tt_align8.cu",
  199. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tt_align8.cu",
  200. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tt_align4.cu",
  201. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tt_align4.cu",
  202. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tt_align4.cu",
  203. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tt_align4.cu",
  204. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tt_align4.cu",
  205. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tt_align4.cu",
  206. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tt_align2.cu",
  207. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tt_align2.cu",
  208. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tt_align2.cu",
  209. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tt_align2.cu",
  210. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tt_align2.cu",
  211. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tt_align2.cu",
  212. "cutlass_tensorop_h1688gemm_256x128_32x2_nn_align8.cu",
  213. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nn_align8.cu",
  214. "cutlass_tensorop_h1688gemm_128x256_32x2_nn_align8.cu",
  215. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nn_align8.cu",
  216. "cutlass_tensorop_h1688gemm_128x128_32x2_nn_align8.cu",
  217. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nn_align8.cu",
  218. "cutlass_tensorop_h1688gemm_256x128_32x2_nn_align4.cu",
  219. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nn_align4.cu",
  220. "cutlass_tensorop_h1688gemm_128x256_32x2_nn_align4.cu",
  221. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nn_align4.cu",
  222. "cutlass_tensorop_h1688gemm_128x128_32x2_nn_align4.cu",
  223. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nn_align4.cu",
  224. "cutlass_tensorop_h1688gemm_256x128_32x2_nn_align2.cu",
  225. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nn_align2.cu",
  226. "cutlass_tensorop_h1688gemm_128x256_32x2_nn_align2.cu",
  227. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nn_align2.cu",
  228. "cutlass_tensorop_h1688gemm_128x128_32x2_nn_align2.cu",
  229. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nn_align2.cu",
  230. "cutlass_tensorop_h1688gemm_256x128_32x2_nt_align8.cu",
  231. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nt_align8.cu",
  232. "cutlass_tensorop_h1688gemm_128x256_32x2_nt_align8.cu",
  233. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nt_align8.cu",
  234. "cutlass_tensorop_h1688gemm_128x128_32x2_nt_align8.cu",
  235. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nt_align8.cu",
  236. "cutlass_tensorop_h1688gemm_256x128_32x2_nt_align4.cu",
  237. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nt_align4.cu",
  238. "cutlass_tensorop_h1688gemm_128x256_32x2_nt_align4.cu",
  239. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nt_align4.cu",
  240. "cutlass_tensorop_h1688gemm_128x128_32x2_nt_align4.cu",
  241. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nt_align4.cu",
  242. "cutlass_tensorop_h1688gemm_256x128_32x2_nt_align2.cu",
  243. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nt_align2.cu",
  244. "cutlass_tensorop_h1688gemm_128x256_32x2_nt_align2.cu",
  245. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nt_align2.cu",
  246. "cutlass_tensorop_h1688gemm_128x128_32x2_nt_align2.cu",
  247. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nt_align2.cu",
  248. "cutlass_tensorop_h1688gemm_256x128_32x2_tn_align8.cu",
  249. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tn_align8.cu",
  250. "cutlass_tensorop_h1688gemm_128x256_32x2_tn_align8.cu",
  251. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tn_align8.cu",
  252. "cutlass_tensorop_h1688gemm_128x128_32x2_tn_align8.cu",
  253. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tn_align8.cu",
  254. "cutlass_tensorop_h1688gemm_256x128_32x2_tn_align4.cu",
  255. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tn_align4.cu",
  256. "cutlass_tensorop_h1688gemm_128x256_32x2_tn_align4.cu",
  257. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tn_align4.cu",
  258. "cutlass_tensorop_h1688gemm_128x128_32x2_tn_align4.cu",
  259. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tn_align4.cu",
  260. "cutlass_tensorop_h1688gemm_256x128_32x2_tn_align2.cu",
  261. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tn_align2.cu",
  262. "cutlass_tensorop_h1688gemm_128x256_32x2_tn_align2.cu",
  263. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tn_align2.cu",
  264. "cutlass_tensorop_h1688gemm_128x128_32x2_tn_align2.cu",
  265. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tn_align2.cu",
  266. "cutlass_tensorop_h1688gemm_256x128_32x2_tt_align8.cu",
  267. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tt_align8.cu",
  268. "cutlass_tensorop_h1688gemm_128x256_32x2_tt_align8.cu",
  269. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tt_align8.cu",
  270. "cutlass_tensorop_h1688gemm_128x128_32x2_tt_align8.cu",
  271. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tt_align8.cu",
  272. "cutlass_tensorop_h1688gemm_256x128_32x2_tt_align4.cu",
  273. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tt_align4.cu",
  274. "cutlass_tensorop_h1688gemm_128x256_32x2_tt_align4.cu",
  275. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tt_align4.cu",
  276. "cutlass_tensorop_h1688gemm_128x128_32x2_tt_align4.cu",
  277. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tt_align4.cu",
  278. "cutlass_tensorop_h1688gemm_256x128_32x2_tt_align2.cu",
  279. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tt_align2.cu",
  280. "cutlass_tensorop_h1688gemm_128x256_32x2_tt_align2.cu",
  281. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tt_align2.cu",
  282. "cutlass_tensorop_h1688gemm_128x128_32x2_tt_align2.cu",
  283. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tt_align2.cu",
  284. "all_gemm_tensorop1688_operations.cu",
  285. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nn_align8.cu",
  286. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nn_align8.cu",
  287. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nn_align8.cu",
  288. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nn_align8.cu",
  289. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nn_align8.cu",
  290. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nn_align8.cu",
  291. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nn_align4.cu",
  292. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nn_align4.cu",
  293. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nn_align4.cu",
  294. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nn_align4.cu",
  295. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nn_align4.cu",
  296. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nn_align4.cu",
  297. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nn_align2.cu",
  298. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nn_align2.cu",
  299. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nn_align2.cu",
  300. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nn_align2.cu",
  301. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nn_align2.cu",
  302. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nn_align2.cu",
  303. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nt_align8.cu",
  304. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nt_align8.cu",
  305. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nt_align8.cu",
  306. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nt_align8.cu",
  307. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nt_align8.cu",
  308. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nt_align8.cu",
  309. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nt_align4.cu",
  310. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nt_align4.cu",
  311. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nt_align4.cu",
  312. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nt_align4.cu",
  313. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nt_align4.cu",
  314. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nt_align4.cu",
  315. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nt_align2.cu",
  316. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nt_align2.cu",
  317. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nt_align2.cu",
  318. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nt_align2.cu",
  319. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nt_align2.cu",
  320. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nt_align2.cu",
  321. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tn_align8.cu",
  322. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tn_align8.cu",
  323. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tn_align8.cu",
  324. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tn_align8.cu",
  325. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tn_align8.cu",
  326. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tn_align8.cu",
  327. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tn_align4.cu",
  328. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tn_align4.cu",
  329. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tn_align4.cu",
  330. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tn_align4.cu",
  331. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tn_align4.cu",
  332. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tn_align4.cu",
  333. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tn_align2.cu",
  334. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tn_align2.cu",
  335. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tn_align2.cu",
  336. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tn_align2.cu",
  337. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tn_align2.cu",
  338. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tn_align2.cu",
  339. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tt_align8.cu",
  340. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tt_align8.cu",
  341. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tt_align8.cu",
  342. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tt_align8.cu",
  343. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tt_align8.cu",
  344. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tt_align8.cu",
  345. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tt_align4.cu",
  346. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tt_align4.cu",
  347. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tt_align4.cu",
  348. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tt_align4.cu",
  349. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tt_align4.cu",
  350. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tt_align4.cu",
  351. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tt_align2.cu",
  352. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tt_align2.cu",
  353. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tt_align2.cu",
  354. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tt_align2.cu",
  355. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tt_align2.cu",
  356. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tt_align2.cu",
  357. "cutlass_tensorop_h884gemm_256x128_32x2_nn_align8.cu",
  358. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nn_align8.cu",
  359. "cutlass_tensorop_h884gemm_128x256_32x2_nn_align8.cu",
  360. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nn_align8.cu",
  361. "cutlass_tensorop_h884gemm_128x128_32x2_nn_align8.cu",
  362. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nn_align8.cu",
  363. "cutlass_tensorop_h884gemm_256x128_32x2_nn_align4.cu",
  364. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nn_align4.cu",
  365. "cutlass_tensorop_h884gemm_128x256_32x2_nn_align4.cu",
  366. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nn_align4.cu",
  367. "cutlass_tensorop_h884gemm_128x128_32x2_nn_align4.cu",
  368. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nn_align4.cu",
  369. "cutlass_tensorop_h884gemm_256x128_32x2_nn_align2.cu",
  370. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nn_align2.cu",
  371. "cutlass_tensorop_h884gemm_128x256_32x2_nn_align2.cu",
  372. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nn_align2.cu",
  373. "cutlass_tensorop_h884gemm_128x128_32x2_nn_align2.cu",
  374. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nn_align2.cu",
  375. "cutlass_tensorop_h884gemm_256x128_32x2_nt_align8.cu",
  376. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nt_align8.cu",
  377. "cutlass_tensorop_h884gemm_128x256_32x2_nt_align8.cu",
  378. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nt_align8.cu",
  379. "cutlass_tensorop_h884gemm_128x128_32x2_nt_align8.cu",
  380. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nt_align8.cu",
  381. "cutlass_tensorop_h884gemm_256x128_32x2_nt_align4.cu",
  382. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nt_align4.cu",
  383. "cutlass_tensorop_h884gemm_128x256_32x2_nt_align4.cu",
  384. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nt_align4.cu",
  385. "cutlass_tensorop_h884gemm_128x128_32x2_nt_align4.cu",
  386. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nt_align4.cu",
  387. "cutlass_tensorop_h884gemm_256x128_32x2_nt_align2.cu",
  388. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nt_align2.cu",
  389. "cutlass_tensorop_h884gemm_128x256_32x2_nt_align2.cu",
  390. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nt_align2.cu",
  391. "cutlass_tensorop_h884gemm_128x128_32x2_nt_align2.cu",
  392. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nt_align2.cu",
  393. "cutlass_tensorop_h884gemm_256x128_32x2_tn_align8.cu",
  394. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tn_align8.cu",
  395. "cutlass_tensorop_h884gemm_128x256_32x2_tn_align8.cu",
  396. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tn_align8.cu",
  397. "cutlass_tensorop_h884gemm_128x128_32x2_tn_align8.cu",
  398. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tn_align8.cu",
  399. "cutlass_tensorop_h884gemm_256x128_32x2_tn_align4.cu",
  400. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tn_align4.cu",
  401. "cutlass_tensorop_h884gemm_128x256_32x2_tn_align4.cu",
  402. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tn_align4.cu",
  403. "cutlass_tensorop_h884gemm_128x128_32x2_tn_align4.cu",
  404. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tn_align4.cu",
  405. "cutlass_tensorop_h884gemm_256x128_32x2_tn_align2.cu",
  406. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tn_align2.cu",
  407. "cutlass_tensorop_h884gemm_128x256_32x2_tn_align2.cu",
  408. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tn_align2.cu",
  409. "cutlass_tensorop_h884gemm_128x128_32x2_tn_align2.cu",
  410. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tn_align2.cu",
  411. "cutlass_tensorop_h884gemm_256x128_32x2_tt_align8.cu",
  412. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tt_align8.cu",
  413. "cutlass_tensorop_h884gemm_128x256_32x2_tt_align8.cu",
  414. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tt_align8.cu",
  415. "cutlass_tensorop_h884gemm_128x128_32x2_tt_align8.cu",
  416. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tt_align8.cu",
  417. "cutlass_tensorop_h884gemm_256x128_32x2_tt_align4.cu",
  418. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tt_align4.cu",
  419. "cutlass_tensorop_h884gemm_128x256_32x2_tt_align4.cu",
  420. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tt_align4.cu",
  421. "cutlass_tensorop_h884gemm_128x128_32x2_tt_align4.cu",
  422. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tt_align4.cu",
  423. "cutlass_tensorop_h884gemm_256x128_32x2_tt_align2.cu",
  424. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tt_align2.cu",
  425. "cutlass_tensorop_h884gemm_128x256_32x2_tt_align2.cu",
  426. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tt_align2.cu",
  427. "cutlass_tensorop_h884gemm_128x128_32x2_tt_align2.cu",
  428. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tt_align2.cu",
  429. "all_gemm_tensorop884_operations.cu",
  430. "cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu",
  431. "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu",
  432. "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu",
  433. "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu",
  434. "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu",
  435. "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu",
  436. "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu",
  437. "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu",
  438. "cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu",
  439. "cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu",
  440. "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu",
  441. "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu",
  442. "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu",
  443. "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu",
  444. "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu",
  445. "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu",
  446. "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu",
  447. "cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu",
  448. "cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu",
  449. "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu",
  450. "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu",
  451. "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu",
  452. "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu",
  453. "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu",
  454. "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu",
  455. "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu",
  456. "cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu",
  457. "cutlass_simt_s8_idgrad_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4_align4x16.cu",
  458. "cutlass_simt_s8_idgrad_s2_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4_align4x16.cu",
  459. "cutlass_simt_s8_idgrad_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4_align4x4.cu",
  460. "cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4_align4x4.cu",
  461. "cutlass_simt_s8_idgrad_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4_align4x8.cu",
  462. "cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4_align4x8.cu",
  463. "cutlass_simt_s8_idgrad_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4_align4x4.cu",
  464. "cutlass_simt_s8_idgrad_s2_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4_align4x4.cu",
  465. "all_deconv_simt_operations.cu",
  466. "cutlass_tensorop_s8_i8816dgrad_id_s8_128x32x32_64x32x32_1_nhwc_ck4rs4_align4x4.cu",
  467. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_128x32x32_64x32x32_1_nhwc_ck4rs4_align4x4.cu",
  468. "cutlass_tensorop_s8_i8816dgrad_id_s8_64x16x32_64x16x32_2_nhwc_ck4rs4_align4x4.cu",
  469. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_64x16x32_64x16x32_2_nhwc_ck4rs4_align4x4.cu",
  470. "cutlass_tensorop_s8_i8816dgrad_id_s8_128x32x32_64x32x32_1_nhwc_ck8rs8_align8x8.cu",
  471. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_128x32x32_64x32x32_1_nhwc_ck8rs8_align8x8.cu",
  472. "cutlass_tensorop_s8_i8816dgrad_id_s8_64x16x32_64x16x32_2_nhwc_ck8rs8_align8x8.cu",
  473. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_64x16x32_64x16x32_2_nhwc_ck8rs8_align8x8.cu",
  474. "cutlass_tensorop_s8_i8816dgrad_id_s8_128x32x32_64x32x32_1_nhwc_ck16rs16_align16x16.cu",
  475. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_128x32x32_64x32x32_1_nhwc_ck16rs16_align16x16.cu",
  476. "cutlass_tensorop_s8_i8816dgrad_id_s8_64x16x32_64x16x32_2_nhwc_ck16rs16_align16x16.cu",
  477. "cutlass_tensorop_s8_i8816dgrad_s2_id_s8_64x16x32_64x16x32_2_nhwc_ck16rs16_align16x16.cu",
  478. "all_deconv_tensorop8816_operations.cu",
  479. "cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  480. "cutlass_simt_s8_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  481. "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  482. "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  483. "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  484. "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  485. "cutlass_simt_s8_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  486. "cutlass_simt_s8_ifprop_1x1_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  487. "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  488. "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  489. "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  490. "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  491. "cutlass_simt_s8_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  492. "cutlass_simt_s8_ifprop_1x1_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  493. "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  494. "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  495. "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  496. "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  497. "cutlass_simt_s8_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  498. "cutlass_simt_s8_ifprop_1x1_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  499. "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  500. "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  501. "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  502. "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  503. "cutlass_simt_s8_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  504. "cutlass_simt_s8_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  505. "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  506. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  507. "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  508. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  509. "cutlass_simt_s8_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  510. "cutlass_simt_s8_ifprop_1x1_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  511. "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  512. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  513. "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  514. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_align4x16.cu",
  515. "cutlass_simt_s8_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  516. "cutlass_simt_s8_ifprop_1x1_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  517. "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  518. "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  519. "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  520. "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4_align4x16.cu",
  521. "cutlass_simt_s8_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  522. "cutlass_simt_s8_ifprop_1x1_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  523. "cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  524. "cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  525. "cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  526. "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_align4x8.cu",
  527. "cutlass_simt_s8_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  528. "cutlass_simt_s8_ifprop_1x1_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  529. "cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  530. "cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  531. "cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  532. "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_align4x4.cu",
  533. "cutlass_simt_s8_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  534. "cutlass_simt_s8_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  535. "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  536. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  537. "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  538. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  539. "cutlass_simt_s8_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  540. "cutlass_simt_s8_ifprop_1x1_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  541. "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  542. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  543. "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  544. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32_align4x16.cu",
  545. "cutlass_simt_u4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  546. "cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  547. "cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  548. "cutlass_simt_u4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  549. "cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  550. "cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  551. "cutlass_simt_s4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  552. "cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  553. "cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc_align4x8.cu",
  554. "cutlass_simt_s4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  555. "cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  556. "cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc_align4x4.cu",
  557. "cutlass_simt_f32_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  558. "cutlass_simt_f32_ifprop_1x1_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  559. "cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  560. "cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  561. "cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  562. "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw_align4x8.cu",
  563. "cutlass_simt_f32_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  564. "cutlass_simt_f32_ifprop_1x1_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  565. "cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  566. "cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  567. "cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  568. "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw_align4x4.cu",
  569. "all_conv2d_simt_operations.cu",
  570. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  571. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  572. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  573. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  574. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  575. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  576. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  577. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  578. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  579. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  580. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  581. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  582. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  583. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  584. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  585. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  586. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  587. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  588. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  589. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  590. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  591. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  592. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  593. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32_align16x16.cu",
  594. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  595. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  596. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  597. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  598. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  599. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_align16x16.cu",
  600. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  601. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  602. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  603. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  604. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  605. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  606. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  607. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  608. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  609. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  610. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  611. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32_align16x16.cu",
  612. "cutlass_tensorop_s8_i8816fprop_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  613. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  614. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  615. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  616. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  617. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  618. "cutlass_tensorop_s8_i8816fprop_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  619. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  620. "cutlass_tensorop_s8_i8816fprop_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  621. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  622. "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  623. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4_align16x16.cu",
  624. "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  625. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  626. "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  627. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  628. "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  629. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  630. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  631. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  632. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  633. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  634. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  635. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  636. "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  637. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  638. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  639. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  640. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  641. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  642. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  643. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  644. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  645. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  646. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  647. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  648. "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  649. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  650. "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  651. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  652. "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  653. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  654. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  655. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  656. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  657. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  658. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  659. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  660. "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  661. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  662. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  663. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  664. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  665. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  666. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  667. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  668. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  669. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  670. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  671. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  672. "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  673. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  674. "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  675. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  676. "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  677. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  678. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  679. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  680. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  681. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  682. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  683. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  684. "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  685. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  686. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  687. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  688. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  689. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  690. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  691. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  692. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  693. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  694. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  695. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  696. "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  697. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  698. "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  699. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  700. "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  701. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  702. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  703. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  704. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  705. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  706. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  707. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  708. "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  709. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  710. "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  711. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  712. "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  713. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  714. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  715. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  716. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  717. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  718. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  719. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  720. "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  721. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  722. "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  723. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  724. "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  725. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  726. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  727. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  728. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  729. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  730. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  731. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  732. "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  733. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  734. "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  735. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  736. "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  737. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  738. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  739. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  740. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  741. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  742. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  743. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  744. "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  745. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  746. "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  747. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  748. "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  749. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4_align4x4.cu",
  750. "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  751. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  752. "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  753. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  754. "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  755. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  756. "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  757. "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  758. "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  759. "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  760. "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  761. "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4_align4x4.cu",
  762. "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  763. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  764. "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  765. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  766. "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  767. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  768. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  769. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  770. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  771. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  772. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  773. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  774. "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  775. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  776. "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  777. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  778. "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  779. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  780. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  781. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  782. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  783. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  784. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  785. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  786. "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  787. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  788. "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  789. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  790. "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  791. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  792. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  793. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  794. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  795. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  796. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  797. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  798. "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  799. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  800. "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  801. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  802. "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  803. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  804. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  805. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  806. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  807. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  808. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  809. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  810. "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  811. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  812. "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  813. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  814. "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  815. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8_align8x8.cu",
  816. "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  817. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  818. "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  819. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  820. "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  821. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  822. "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  823. "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  824. "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  825. "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  826. "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  827. "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8_align8x8.cu",
  828. "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  829. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  830. "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  831. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  832. "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  833. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  834. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  835. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  836. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  837. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  838. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  839. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  840. "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  841. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  842. "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  843. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  844. "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  845. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  846. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  847. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  848. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  849. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  850. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  851. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  852. "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  853. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  854. "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  855. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  856. "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  857. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  858. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  859. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  860. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  861. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  862. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  863. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  864. "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  865. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  866. "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  867. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  868. "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  869. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  870. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  871. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  872. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  873. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  874. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  875. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  876. "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  877. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  878. "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  879. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  880. "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  881. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16_align16x16.cu",
  882. "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  883. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  884. "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  885. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  886. "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  887. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  888. "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  889. "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  890. "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  891. "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  892. "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  893. "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16_align16x16.cu",
  894. "all_conv2d_tensorop8816_operations.cu",
  895. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  896. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  897. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  898. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  899. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  900. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  901. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  902. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  903. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  904. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  905. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  906. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  907. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  908. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  909. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  910. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  911. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  912. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  913. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  914. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  915. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  916. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  917. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  918. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  919. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  920. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  921. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  922. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  923. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  924. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  925. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  926. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  927. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  928. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  929. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  930. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64_align32x32.cu",
  931. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  932. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  933. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  934. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64_align32x32.cu",
  935. "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  936. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  937. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  938. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  939. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  940. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  941. "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  942. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  943. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  944. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  945. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  946. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  947. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  948. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  949. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  950. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  951. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  952. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  953. "cutlass_tensorop_s4_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  954. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  955. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  956. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  957. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  958. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  959. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  960. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  961. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  962. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  963. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  964. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  965. "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  966. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  967. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  968. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  969. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  970. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  971. "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  972. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  973. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  974. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  975. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  976. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  977. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  978. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  979. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  980. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  981. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  982. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  983. "cutlass_tensorop_s4_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  984. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  985. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  986. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  987. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  988. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  989. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  990. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  991. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  992. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  993. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  994. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  995. "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  996. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  997. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  998. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  999. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1000. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1001. "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1002. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1003. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1004. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1005. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1006. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1007. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1008. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1009. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1010. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1011. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1012. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1013. "cutlass_tensorop_s4_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1014. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1015. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1016. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1017. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1018. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1019. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1020. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1021. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1022. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1023. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1024. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1025. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1026. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1027. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1028. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1029. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1030. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1031. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1032. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1033. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1034. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1035. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1036. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1037. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1038. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1039. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1040. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1041. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1042. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1043. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1044. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1045. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1046. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1047. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1048. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1049. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1050. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1051. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1052. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1053. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1054. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1055. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1056. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1057. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1058. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1059. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1060. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1061. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1062. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1063. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1064. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1065. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1066. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1067. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1068. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1069. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1070. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1071. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1072. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1073. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1074. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1075. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1076. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1077. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1078. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1079. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1080. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1081. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1082. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1083. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1084. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1085. "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1086. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1087. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1088. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1089. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1090. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1091. "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1092. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1093. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1094. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1095. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1096. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1097. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1098. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1099. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1100. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1101. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1102. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1103. "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1104. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1105. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1106. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1107. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1108. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1109. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1110. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1111. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1112. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1113. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1114. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1115. "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1116. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1117. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1118. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1119. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1120. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1121. "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1122. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1123. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1124. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1125. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1126. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1127. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1128. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1129. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1130. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1131. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1132. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1133. "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1134. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1135. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1136. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1137. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1138. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1139. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1140. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1141. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1142. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1143. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1144. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1145. "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1146. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1147. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1148. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1149. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1150. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1151. "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1152. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1153. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1154. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1155. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1156. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1157. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1158. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1159. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1160. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1161. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1162. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1163. "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1164. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1165. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1166. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1167. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1168. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1169. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1170. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1171. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1172. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1173. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1174. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1175. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1176. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1177. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1178. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8_align8x8.cu",
  1179. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1180. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1181. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1182. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1183. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1184. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1185. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1186. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8_align8x8.cu",
  1187. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1188. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1189. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1190. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1191. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1192. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1193. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1194. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8_align8x8.cu",
  1195. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1196. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1197. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1198. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16_align16x16.cu",
  1199. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1200. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1201. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1202. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1203. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1204. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1205. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1206. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16_align16x16.cu",
  1207. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1208. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1209. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1210. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1211. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1212. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1213. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1214. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16_align16x16.cu",
  1215. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1216. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1217. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1218. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32_align32x32.cu",
  1219. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1220. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1221. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1222. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1223. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1224. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1225. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1226. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32_align32x32.cu",
  1227. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1228. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1229. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1230. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1231. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1232. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1233. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1234. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32_align32x32.cu",
  1235. "all_conv2d_tensorop8832_operations.cu",
  1236. "cutlass_simt_sdwfprop_id_f32_32x32x8_32x32x8_2_nchw_nchw_align4x1.cu",
  1237. "cutlass_simt_sdwfprop_relu_f32_32x32x8_32x32x8_2_nchw_nchw_align4x1.cu",
  1238. "cutlass_simt_sdwfprop_id_f32_32x32x8_32x32x8_2_nchw_nchw_align1x1.cu",
  1239. "cutlass_simt_sdwfprop_relu_f32_32x32x8_32x32x8_2_nchw_nchw_align1x1.cu",
  1240. "cutlass_simt_sdwfprop_id_f32_32x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1241. "cutlass_simt_sdwfprop_relu_f32_32x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1242. "cutlass_simt_sdwfprop_id_f32_32x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1243. "cutlass_simt_sdwfprop_relu_f32_32x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1244. "cutlass_simt_sdwfprop_id_f32_64x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1245. "cutlass_simt_sdwfprop_relu_f32_64x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1246. "cutlass_simt_sdwfprop_id_f32_64x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1247. "cutlass_simt_sdwfprop_relu_f32_64x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1248. "cutlass_simt_sdwfprop_id_f32_32x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1249. "cutlass_simt_sdwfprop_relu_f32_32x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1250. "cutlass_simt_sdwfprop_id_f32_32x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1251. "cutlass_simt_sdwfprop_relu_f32_32x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1252. "cutlass_simt_sdwfprop_id_f32_64x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1253. "cutlass_simt_sdwfprop_relu_f32_64x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1254. "cutlass_simt_sdwfprop_id_f32_64x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1255. "cutlass_simt_sdwfprop_relu_f32_64x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1256. "cutlass_simt_sdwfprop_id_f32_128x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1257. "cutlass_simt_sdwfprop_relu_f32_128x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1258. "cutlass_simt_sdwfprop_id_f32_128x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1259. "cutlass_simt_sdwfprop_relu_f32_128x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1260. "cutlass_simt_sdwfprop_id_f32_64x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1261. "cutlass_simt_sdwfprop_relu_f32_64x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1262. "cutlass_simt_sdwfprop_id_f32_64x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1263. "cutlass_simt_sdwfprop_relu_f32_64x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1264. "cutlass_simt_sdwfprop_id_f32_128x64x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1265. "cutlass_simt_sdwfprop_relu_f32_128x64x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1266. "cutlass_simt_sdwfprop_id_f32_128x64x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1267. "cutlass_simt_sdwfprop_relu_f32_128x64x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1268. "cutlass_simt_sdwfprop_id_f32_128x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1269. "cutlass_simt_sdwfprop_relu_f32_128x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1270. "cutlass_simt_sdwfprop_id_f32_128x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1271. "cutlass_simt_sdwfprop_relu_f32_128x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1272. "all_dwconv2d_fprop_simt_operations.cu",
  1273. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1274. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1275. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1276. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1277. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1278. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1279. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1280. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1281. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1282. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1283. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1284. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1285. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1286. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1287. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1288. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1289. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1290. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1291. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1292. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1293. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1294. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1295. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1296. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1297. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1298. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1299. "cutlass_tensorop_f16_s884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1300. "cutlass_tensorop_f16_s884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1301. "cutlass_tensorop_f16_s884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1302. "cutlass_tensorop_f16_s884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1303. "cutlass_tensorop_h884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1304. "cutlass_tensorop_h884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1305. "cutlass_tensorop_h884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1306. "cutlass_tensorop_h884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1307. "cutlass_tensorop_h884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1308. "cutlass_tensorop_h884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1309. "cutlass_tensorop_h884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1310. "cutlass_tensorop_h884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1311. "cutlass_tensorop_h884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1312. "cutlass_tensorop_h884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1313. "cutlass_tensorop_h884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1314. "cutlass_tensorop_h884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1315. "cutlass_tensorop_h884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1316. "cutlass_tensorop_h884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1317. "cutlass_tensorop_h884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1318. "cutlass_tensorop_h884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1319. "cutlass_tensorop_h884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1320. "cutlass_tensorop_h884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1321. "cutlass_tensorop_h884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1322. "cutlass_tensorop_h884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1323. "cutlass_tensorop_h884dwfprop_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1324. "cutlass_tensorop_h884dwfprop_relu_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1325. "cutlass_tensorop_h884dwfprop_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1326. "cutlass_tensorop_h884dwfprop_relu_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1327. "cutlass_tensorop_h884dwfprop_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1328. "cutlass_tensorop_h884dwfprop_relu_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1329. "cutlass_tensorop_h884dwfprop_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1330. "cutlass_tensorop_h884dwfprop_relu_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1331. "cutlass_tensorop_h884dwfprop_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1332. "cutlass_tensorop_h884dwfprop_relu_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1333. "all_dwconv2d_fprop_tensorop884_operations.cu",
  1334. "cutlass_simt_sdwdgrad_id_f32_32x32x8_32x32x8_2_nchw_nchw_align4x1.cu",
  1335. "cutlass_simt_sdwdgrad_id_f32_32x32x8_32x32x8_2_nchw_nchw_align1x1.cu",
  1336. "cutlass_simt_sdwdgrad_id_f32_32x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1337. "cutlass_simt_sdwdgrad_id_f32_32x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1338. "cutlass_simt_sdwdgrad_id_f32_64x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1339. "cutlass_simt_sdwdgrad_id_f32_64x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1340. "cutlass_simt_sdwdgrad_id_f32_32x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1341. "cutlass_simt_sdwdgrad_id_f32_32x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1342. "cutlass_simt_sdwdgrad_id_f32_64x64x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1343. "cutlass_simt_sdwdgrad_id_f32_64x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1344. "cutlass_simt_sdwdgrad_id_f32_128x32x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1345. "cutlass_simt_sdwdgrad_id_f32_128x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1346. "cutlass_simt_sdwdgrad_id_f32_64x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1347. "cutlass_simt_sdwdgrad_id_f32_64x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1348. "cutlass_simt_sdwdgrad_id_f32_128x64x8_64x32x8_2_nchw_nchw_align4x1.cu",
  1349. "cutlass_simt_sdwdgrad_id_f32_128x64x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1350. "cutlass_simt_sdwdgrad_id_f32_128x128x8_32x64x8_2_nchw_nchw_align4x1.cu",
  1351. "cutlass_simt_sdwdgrad_id_f32_128x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1352. "all_dwconv2d_dgrad_simt_operations.cu",
  1353. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1354. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1355. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1356. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1357. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1358. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1359. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1360. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1361. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1362. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1363. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1364. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1365. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1366. "cutlass_tensorop_f16_s884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1367. "cutlass_tensorop_f16_s884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1368. "cutlass_tensorop_h884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1369. "cutlass_tensorop_h884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1370. "cutlass_tensorop_h884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1371. "cutlass_tensorop_h884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1372. "cutlass_tensorop_h884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1373. "cutlass_tensorop_h884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1374. "cutlass_tensorop_h884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1375. "cutlass_tensorop_h884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1376. "cutlass_tensorop_h884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1377. "cutlass_tensorop_h884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1378. "cutlass_tensorop_h884dwdgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1379. "cutlass_tensorop_h884dwdgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1380. "cutlass_tensorop_h884dwdgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1381. "cutlass_tensorop_h884dwdgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1382. "cutlass_tensorop_h884dwdgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1383. "all_dwconv2d_dgrad_tensorop884_operations.cu",
  1384. "cutlass_simt_sdwwgrad_id_f32_32x32x8_32x32x8_2_nchw_nchw_align1x1.cu",
  1385. "cutlass_simt_sdwwgrad_id_f32_32x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1386. "cutlass_simt_sdwwgrad_id_f32_64x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1387. "cutlass_simt_sdwwgrad_id_f32_32x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1388. "cutlass_simt_sdwwgrad_id_f32_64x64x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1389. "cutlass_simt_sdwwgrad_id_f32_128x32x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1390. "cutlass_simt_sdwwgrad_id_f32_64x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1391. "cutlass_simt_sdwwgrad_id_f32_128x64x8_64x32x8_2_nchw_nchw_align1x1.cu",
  1392. "cutlass_simt_sdwwgrad_id_f32_128x128x8_32x64x8_2_nchw_nchw_align1x1.cu",
  1393. "all_dwconv2d_wgrad_simt_operations.cu",
  1394. "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x8.cu",
  1395. "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x8.cu",
  1396. "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x8.cu",
  1397. "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x8.cu",
  1398. "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x8.cu",
  1399. "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x2.cu",
  1400. "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x2.cu",
  1401. "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x2.cu",
  1402. "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x2.cu",
  1403. "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x2.cu",
  1404. "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align8x1.cu",
  1405. "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1406. "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1407. "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1408. "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align8x1.cu",
  1409. "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x8.cu",
  1410. "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x8.cu",
  1411. "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x8.cu",
  1412. "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x8.cu",
  1413. "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x8.cu",
  1414. "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x2.cu",
  1415. "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x2.cu",
  1416. "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x2.cu",
  1417. "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x2.cu",
  1418. "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x2.cu",
  1419. "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align2x1.cu",
  1420. "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1421. "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1422. "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1423. "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align2x1.cu",
  1424. "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x8.cu",
  1425. "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x8.cu",
  1426. "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x8.cu",
  1427. "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x8.cu",
  1428. "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x8.cu",
  1429. "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x2.cu",
  1430. "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x2.cu",
  1431. "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x2.cu",
  1432. "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x2.cu",
  1433. "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x2.cu",
  1434. "cutlass_tensorop_s884dwwgrad_id_f16_128x256x32_64x64x32_2_nchw_nchw_align1x1.cu",
  1435. "cutlass_tensorop_s884dwwgrad_id_f16_128x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1436. "cutlass_tensorop_s884dwwgrad_id_f16_64x128x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1437. "cutlass_tensorop_s884dwwgrad_id_f16_128x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1438. "cutlass_tensorop_s884dwwgrad_id_f16_64x64x32_32x32x32_2_nchw_nchw_align1x1.cu",
  1439. "all_dwconv2d_wgrad_tensorop884_operations.cu",
  1440. ]