You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

list.bzl 97 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224
  1. # Generated by dnn/scripts/cutlass_generator/gen_list.py
  2. cutlass_gen_list = [
  3. "cutlass_simt_sgemm_8x32_8x2_nn_align1.cu",
  4. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nn_align1.cu",
  5. "cutlass_simt_sgemm_16x32_8x2_nn_align1.cu",
  6. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nn_align1.cu",
  7. "cutlass_simt_sgemm_16x64_8x2_nn_align1.cu",
  8. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nn_align1.cu",
  9. "cutlass_simt_sgemm_32x32_8x2_nn_align1.cu",
  10. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nn_align1.cu",
  11. "cutlass_simt_sgemm_32x64_8x2_nn_align1.cu",
  12. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nn_align1.cu",
  13. "cutlass_simt_sgemm_64x32_8x2_nn_align1.cu",
  14. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nn_align1.cu",
  15. "cutlass_simt_sgemm_16x128_8x2_nn_align1.cu",
  16. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nn_align1.cu",
  17. "cutlass_simt_sgemm_32x128_8x2_nn_align1.cu",
  18. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nn_align1.cu",
  19. "cutlass_simt_sgemm_64x64_8x2_nn_align1.cu",
  20. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nn_align1.cu",
  21. "cutlass_simt_sgemm_128x32_8x2_nn_align1.cu",
  22. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nn_align1.cu",
  23. "cutlass_simt_sgemm_64x128_8x2_nn_align1.cu",
  24. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nn_align1.cu",
  25. "cutlass_simt_sgemm_128x64_8x2_nn_align1.cu",
  26. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nn_align1.cu",
  27. "cutlass_simt_sgemm_32x256_8x2_nn_align1.cu",
  28. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nn_align1.cu",
  29. "cutlass_simt_sgemm_64x256_8x2_nn_align1.cu",
  30. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nn_align1.cu",
  31. "cutlass_simt_sgemm_128x128_8x2_nn_align1.cu",
  32. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nn_align1.cu",
  33. "cutlass_simt_sgemm_256x32_8x2_nn_align1.cu",
  34. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nn_align1.cu",
  35. "cutlass_simt_sgemm_256x64_8x2_nn_align1.cu",
  36. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nn_align1.cu",
  37. "cutlass_simt_sgemm_8x32_8x2_nt_align1.cu",
  38. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_nt_align1.cu",
  39. "cutlass_simt_sgemm_16x32_8x2_nt_align1.cu",
  40. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_nt_align1.cu",
  41. "cutlass_simt_sgemm_16x64_8x2_nt_align1.cu",
  42. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_nt_align1.cu",
  43. "cutlass_simt_sgemm_32x32_8x2_nt_align1.cu",
  44. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_nt_align1.cu",
  45. "cutlass_simt_sgemm_32x64_8x2_nt_align1.cu",
  46. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_nt_align1.cu",
  47. "cutlass_simt_sgemm_64x32_8x2_nt_align1.cu",
  48. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_nt_align1.cu",
  49. "cutlass_simt_sgemm_16x128_8x2_nt_align1.cu",
  50. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_nt_align1.cu",
  51. "cutlass_simt_sgemm_32x128_8x2_nt_align1.cu",
  52. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_nt_align1.cu",
  53. "cutlass_simt_sgemm_64x64_8x2_nt_align1.cu",
  54. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_nt_align1.cu",
  55. "cutlass_simt_sgemm_128x32_8x2_nt_align1.cu",
  56. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_nt_align1.cu",
  57. "cutlass_simt_sgemm_64x128_8x2_nt_align1.cu",
  58. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_nt_align1.cu",
  59. "cutlass_simt_sgemm_128x64_8x2_nt_align1.cu",
  60. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_nt_align1.cu",
  61. "cutlass_simt_sgemm_32x256_8x2_nt_align1.cu",
  62. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_nt_align1.cu",
  63. "cutlass_simt_sgemm_64x256_8x2_nt_align1.cu",
  64. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_nt_align1.cu",
  65. "cutlass_simt_sgemm_128x128_8x2_nt_align1.cu",
  66. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_nt_align1.cu",
  67. "cutlass_simt_sgemm_256x32_8x2_nt_align1.cu",
  68. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_nt_align1.cu",
  69. "cutlass_simt_sgemm_256x64_8x2_nt_align1.cu",
  70. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_nt_align1.cu",
  71. "cutlass_simt_sgemm_8x32_8x2_tn_align1.cu",
  72. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tn_align1.cu",
  73. "cutlass_simt_sgemm_16x32_8x2_tn_align1.cu",
  74. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tn_align1.cu",
  75. "cutlass_simt_sgemm_16x64_8x2_tn_align1.cu",
  76. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tn_align1.cu",
  77. "cutlass_simt_sgemm_32x32_8x2_tn_align1.cu",
  78. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tn_align1.cu",
  79. "cutlass_simt_sgemm_32x64_8x2_tn_align1.cu",
  80. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tn_align1.cu",
  81. "cutlass_simt_sgemm_64x32_8x2_tn_align1.cu",
  82. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tn_align1.cu",
  83. "cutlass_simt_sgemm_16x128_8x2_tn_align1.cu",
  84. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tn_align1.cu",
  85. "cutlass_simt_sgemm_32x128_8x2_tn_align1.cu",
  86. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tn_align1.cu",
  87. "cutlass_simt_sgemm_64x64_8x2_tn_align1.cu",
  88. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tn_align1.cu",
  89. "cutlass_simt_sgemm_128x32_8x2_tn_align1.cu",
  90. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tn_align1.cu",
  91. "cutlass_simt_sgemm_64x128_8x2_tn_align1.cu",
  92. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tn_align1.cu",
  93. "cutlass_simt_sgemm_128x64_8x2_tn_align1.cu",
  94. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tn_align1.cu",
  95. "cutlass_simt_sgemm_32x256_8x2_tn_align1.cu",
  96. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tn_align1.cu",
  97. "cutlass_simt_sgemm_64x256_8x2_tn_align1.cu",
  98. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tn_align1.cu",
  99. "cutlass_simt_sgemm_128x128_8x2_tn_align1.cu",
  100. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tn_align1.cu",
  101. "cutlass_simt_sgemm_256x32_8x2_tn_align1.cu",
  102. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tn_align1.cu",
  103. "cutlass_simt_sgemm_256x64_8x2_tn_align1.cu",
  104. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tn_align1.cu",
  105. "cutlass_simt_sgemm_8x32_8x2_tt_align1.cu",
  106. "cutlass_simt_sgemm_split_k_parallel_8x32_8x2_tt_align1.cu",
  107. "cutlass_simt_sgemm_16x32_8x2_tt_align1.cu",
  108. "cutlass_simt_sgemm_split_k_parallel_16x32_8x2_tt_align1.cu",
  109. "cutlass_simt_sgemm_16x64_8x2_tt_align1.cu",
  110. "cutlass_simt_sgemm_split_k_parallel_16x64_8x2_tt_align1.cu",
  111. "cutlass_simt_sgemm_32x32_8x2_tt_align1.cu",
  112. "cutlass_simt_sgemm_split_k_parallel_32x32_8x2_tt_align1.cu",
  113. "cutlass_simt_sgemm_32x64_8x2_tt_align1.cu",
  114. "cutlass_simt_sgemm_split_k_parallel_32x64_8x2_tt_align1.cu",
  115. "cutlass_simt_sgemm_64x32_8x2_tt_align1.cu",
  116. "cutlass_simt_sgemm_split_k_parallel_64x32_8x2_tt_align1.cu",
  117. "cutlass_simt_sgemm_16x128_8x2_tt_align1.cu",
  118. "cutlass_simt_sgemm_split_k_parallel_16x128_8x2_tt_align1.cu",
  119. "cutlass_simt_sgemm_32x128_8x2_tt_align1.cu",
  120. "cutlass_simt_sgemm_split_k_parallel_32x128_8x2_tt_align1.cu",
  121. "cutlass_simt_sgemm_64x64_8x2_tt_align1.cu",
  122. "cutlass_simt_sgemm_split_k_parallel_64x64_8x2_tt_align1.cu",
  123. "cutlass_simt_sgemm_128x32_8x2_tt_align1.cu",
  124. "cutlass_simt_sgemm_split_k_parallel_128x32_8x2_tt_align1.cu",
  125. "cutlass_simt_sgemm_64x128_8x2_tt_align1.cu",
  126. "cutlass_simt_sgemm_split_k_parallel_64x128_8x2_tt_align1.cu",
  127. "cutlass_simt_sgemm_128x64_8x2_tt_align1.cu",
  128. "cutlass_simt_sgemm_split_k_parallel_128x64_8x2_tt_align1.cu",
  129. "cutlass_simt_sgemm_32x256_8x2_tt_align1.cu",
  130. "cutlass_simt_sgemm_split_k_parallel_32x256_8x2_tt_align1.cu",
  131. "cutlass_simt_sgemm_64x256_8x2_tt_align1.cu",
  132. "cutlass_simt_sgemm_split_k_parallel_64x256_8x2_tt_align1.cu",
  133. "cutlass_simt_sgemm_128x128_8x2_tt_align1.cu",
  134. "cutlass_simt_sgemm_split_k_parallel_128x128_8x2_tt_align1.cu",
  135. "cutlass_simt_sgemm_256x32_8x2_tt_align1.cu",
  136. "cutlass_simt_sgemm_split_k_parallel_256x32_8x2_tt_align1.cu",
  137. "cutlass_simt_sgemm_256x64_8x2_tt_align1.cu",
  138. "cutlass_simt_sgemm_split_k_parallel_256x64_8x2_tt_align1.cu",
  139. "all_gemm_simt_operations.cu",
  140. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nn_align8.cu",
  141. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nn_align8.cu",
  142. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nn_align8.cu",
  143. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nn_align8.cu",
  144. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nn_align8.cu",
  145. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nn_align8.cu",
  146. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nn_align4.cu",
  147. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nn_align4.cu",
  148. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nn_align4.cu",
  149. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nn_align4.cu",
  150. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nn_align4.cu",
  151. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nn_align4.cu",
  152. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nn_align2.cu",
  153. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nn_align2.cu",
  154. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nn_align2.cu",
  155. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nn_align2.cu",
  156. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nn_align2.cu",
  157. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nn_align2.cu",
  158. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nt_align8.cu",
  159. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nt_align8.cu",
  160. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nt_align8.cu",
  161. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nt_align8.cu",
  162. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nt_align8.cu",
  163. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nt_align8.cu",
  164. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nt_align4.cu",
  165. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nt_align4.cu",
  166. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nt_align4.cu",
  167. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nt_align4.cu",
  168. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nt_align4.cu",
  169. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nt_align4.cu",
  170. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_nt_align2.cu",
  171. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_nt_align2.cu",
  172. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_nt_align2.cu",
  173. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_nt_align2.cu",
  174. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_nt_align2.cu",
  175. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_nt_align2.cu",
  176. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tn_align8.cu",
  177. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tn_align8.cu",
  178. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tn_align8.cu",
  179. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tn_align8.cu",
  180. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tn_align8.cu",
  181. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tn_align8.cu",
  182. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tn_align4.cu",
  183. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tn_align4.cu",
  184. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tn_align4.cu",
  185. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tn_align4.cu",
  186. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tn_align4.cu",
  187. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tn_align4.cu",
  188. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tn_align2.cu",
  189. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tn_align2.cu",
  190. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tn_align2.cu",
  191. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tn_align2.cu",
  192. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tn_align2.cu",
  193. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tn_align2.cu",
  194. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tt_align8.cu",
  195. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tt_align8.cu",
  196. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tt_align8.cu",
  197. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tt_align8.cu",
  198. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tt_align8.cu",
  199. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tt_align8.cu",
  200. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tt_align4.cu",
  201. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tt_align4.cu",
  202. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tt_align4.cu",
  203. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tt_align4.cu",
  204. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tt_align4.cu",
  205. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tt_align4.cu",
  206. "cutlass_tensorop_f16_s1688gemm_f16_256x128_32x2_tt_align2.cu",
  207. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_256x128_32x2_tt_align2.cu",
  208. "cutlass_tensorop_f16_s1688gemm_f16_128x256_32x2_tt_align2.cu",
  209. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x256_32x2_tt_align2.cu",
  210. "cutlass_tensorop_f16_s1688gemm_f16_128x128_32x2_tt_align2.cu",
  211. "cutlass_tensorop_f16_s1688gemm_split_k_parallel_f16_128x128_32x2_tt_align2.cu",
  212. "cutlass_tensorop_h1688gemm_256x128_32x2_nn_align8.cu",
  213. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nn_align8.cu",
  214. "cutlass_tensorop_h1688gemm_128x256_32x2_nn_align8.cu",
  215. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nn_align8.cu",
  216. "cutlass_tensorop_h1688gemm_128x128_32x2_nn_align8.cu",
  217. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nn_align8.cu",
  218. "cutlass_tensorop_h1688gemm_256x128_32x2_nn_align4.cu",
  219. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nn_align4.cu",
  220. "cutlass_tensorop_h1688gemm_128x256_32x2_nn_align4.cu",
  221. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nn_align4.cu",
  222. "cutlass_tensorop_h1688gemm_128x128_32x2_nn_align4.cu",
  223. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nn_align4.cu",
  224. "cutlass_tensorop_h1688gemm_256x128_32x2_nn_align2.cu",
  225. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nn_align2.cu",
  226. "cutlass_tensorop_h1688gemm_128x256_32x2_nn_align2.cu",
  227. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nn_align2.cu",
  228. "cutlass_tensorop_h1688gemm_128x128_32x2_nn_align2.cu",
  229. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nn_align2.cu",
  230. "cutlass_tensorop_h1688gemm_256x128_32x2_nt_align8.cu",
  231. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nt_align8.cu",
  232. "cutlass_tensorop_h1688gemm_128x256_32x2_nt_align8.cu",
  233. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nt_align8.cu",
  234. "cutlass_tensorop_h1688gemm_128x128_32x2_nt_align8.cu",
  235. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nt_align8.cu",
  236. "cutlass_tensorop_h1688gemm_256x128_32x2_nt_align4.cu",
  237. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nt_align4.cu",
  238. "cutlass_tensorop_h1688gemm_128x256_32x2_nt_align4.cu",
  239. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nt_align4.cu",
  240. "cutlass_tensorop_h1688gemm_128x128_32x2_nt_align4.cu",
  241. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nt_align4.cu",
  242. "cutlass_tensorop_h1688gemm_256x128_32x2_nt_align2.cu",
  243. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_nt_align2.cu",
  244. "cutlass_tensorop_h1688gemm_128x256_32x2_nt_align2.cu",
  245. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_nt_align2.cu",
  246. "cutlass_tensorop_h1688gemm_128x128_32x2_nt_align2.cu",
  247. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_nt_align2.cu",
  248. "cutlass_tensorop_h1688gemm_256x128_32x2_tn_align8.cu",
  249. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tn_align8.cu",
  250. "cutlass_tensorop_h1688gemm_128x256_32x2_tn_align8.cu",
  251. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tn_align8.cu",
  252. "cutlass_tensorop_h1688gemm_128x128_32x2_tn_align8.cu",
  253. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tn_align8.cu",
  254. "cutlass_tensorop_h1688gemm_256x128_32x2_tn_align4.cu",
  255. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tn_align4.cu",
  256. "cutlass_tensorop_h1688gemm_128x256_32x2_tn_align4.cu",
  257. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tn_align4.cu",
  258. "cutlass_tensorop_h1688gemm_128x128_32x2_tn_align4.cu",
  259. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tn_align4.cu",
  260. "cutlass_tensorop_h1688gemm_256x128_32x2_tn_align2.cu",
  261. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tn_align2.cu",
  262. "cutlass_tensorop_h1688gemm_128x256_32x2_tn_align2.cu",
  263. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tn_align2.cu",
  264. "cutlass_tensorop_h1688gemm_128x128_32x2_tn_align2.cu",
  265. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tn_align2.cu",
  266. "cutlass_tensorop_h1688gemm_256x128_32x2_tt_align8.cu",
  267. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tt_align8.cu",
  268. "cutlass_tensorop_h1688gemm_128x256_32x2_tt_align8.cu",
  269. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tt_align8.cu",
  270. "cutlass_tensorop_h1688gemm_128x128_32x2_tt_align8.cu",
  271. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tt_align8.cu",
  272. "cutlass_tensorop_h1688gemm_256x128_32x2_tt_align4.cu",
  273. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tt_align4.cu",
  274. "cutlass_tensorop_h1688gemm_128x256_32x2_tt_align4.cu",
  275. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tt_align4.cu",
  276. "cutlass_tensorop_h1688gemm_128x128_32x2_tt_align4.cu",
  277. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tt_align4.cu",
  278. "cutlass_tensorop_h1688gemm_256x128_32x2_tt_align2.cu",
  279. "cutlass_tensorop_h1688gemm_split_k_parallel_256x128_32x2_tt_align2.cu",
  280. "cutlass_tensorop_h1688gemm_128x256_32x2_tt_align2.cu",
  281. "cutlass_tensorop_h1688gemm_split_k_parallel_128x256_32x2_tt_align2.cu",
  282. "cutlass_tensorop_h1688gemm_128x128_32x2_tt_align2.cu",
  283. "cutlass_tensorop_h1688gemm_split_k_parallel_128x128_32x2_tt_align2.cu",
  284. "all_gemm_tensorop1688_operations.cu",
  285. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nn_align8.cu",
  286. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nn_align8.cu",
  287. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nn_align8.cu",
  288. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nn_align8.cu",
  289. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nn_align8.cu",
  290. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nn_align8.cu",
  291. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nn_align4.cu",
  292. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nn_align4.cu",
  293. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nn_align4.cu",
  294. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nn_align4.cu",
  295. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nn_align4.cu",
  296. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nn_align4.cu",
  297. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nn_align2.cu",
  298. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nn_align2.cu",
  299. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nn_align2.cu",
  300. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nn_align2.cu",
  301. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nn_align2.cu",
  302. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nn_align2.cu",
  303. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nt_align8.cu",
  304. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nt_align8.cu",
  305. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nt_align8.cu",
  306. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nt_align8.cu",
  307. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nt_align8.cu",
  308. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nt_align8.cu",
  309. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nt_align4.cu",
  310. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nt_align4.cu",
  311. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nt_align4.cu",
  312. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nt_align4.cu",
  313. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nt_align4.cu",
  314. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nt_align4.cu",
  315. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_nt_align2.cu",
  316. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_nt_align2.cu",
  317. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_nt_align2.cu",
  318. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_nt_align2.cu",
  319. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_nt_align2.cu",
  320. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_nt_align2.cu",
  321. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tn_align8.cu",
  322. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tn_align8.cu",
  323. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tn_align8.cu",
  324. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tn_align8.cu",
  325. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tn_align8.cu",
  326. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tn_align8.cu",
  327. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tn_align4.cu",
  328. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tn_align4.cu",
  329. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tn_align4.cu",
  330. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tn_align4.cu",
  331. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tn_align4.cu",
  332. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tn_align4.cu",
  333. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tn_align2.cu",
  334. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tn_align2.cu",
  335. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tn_align2.cu",
  336. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tn_align2.cu",
  337. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tn_align2.cu",
  338. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tn_align2.cu",
  339. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tt_align8.cu",
  340. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tt_align8.cu",
  341. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tt_align8.cu",
  342. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tt_align8.cu",
  343. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tt_align8.cu",
  344. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tt_align8.cu",
  345. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tt_align4.cu",
  346. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tt_align4.cu",
  347. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tt_align4.cu",
  348. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tt_align4.cu",
  349. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tt_align4.cu",
  350. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tt_align4.cu",
  351. "cutlass_tensorop_f16_s884gemm_f16_256x128_32x2_tt_align2.cu",
  352. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_256x128_32x2_tt_align2.cu",
  353. "cutlass_tensorop_f16_s884gemm_f16_128x256_32x2_tt_align2.cu",
  354. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x256_32x2_tt_align2.cu",
  355. "cutlass_tensorop_f16_s884gemm_f16_128x128_32x2_tt_align2.cu",
  356. "cutlass_tensorop_f16_s884gemm_split_k_parallel_f16_128x128_32x2_tt_align2.cu",
  357. "cutlass_tensorop_h884gemm_256x128_32x2_nn_align8.cu",
  358. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nn_align8.cu",
  359. "cutlass_tensorop_h884gemm_128x256_32x2_nn_align8.cu",
  360. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nn_align8.cu",
  361. "cutlass_tensorop_h884gemm_128x128_32x2_nn_align8.cu",
  362. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nn_align8.cu",
  363. "cutlass_tensorop_h884gemm_256x128_32x2_nn_align4.cu",
  364. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nn_align4.cu",
  365. "cutlass_tensorop_h884gemm_128x256_32x2_nn_align4.cu",
  366. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nn_align4.cu",
  367. "cutlass_tensorop_h884gemm_128x128_32x2_nn_align4.cu",
  368. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nn_align4.cu",
  369. "cutlass_tensorop_h884gemm_256x128_32x2_nn_align2.cu",
  370. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nn_align2.cu",
  371. "cutlass_tensorop_h884gemm_128x256_32x2_nn_align2.cu",
  372. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nn_align2.cu",
  373. "cutlass_tensorop_h884gemm_128x128_32x2_nn_align2.cu",
  374. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nn_align2.cu",
  375. "cutlass_tensorop_h884gemm_256x128_32x2_nt_align8.cu",
  376. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nt_align8.cu",
  377. "cutlass_tensorop_h884gemm_128x256_32x2_nt_align8.cu",
  378. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nt_align8.cu",
  379. "cutlass_tensorop_h884gemm_128x128_32x2_nt_align8.cu",
  380. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nt_align8.cu",
  381. "cutlass_tensorop_h884gemm_256x128_32x2_nt_align4.cu",
  382. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nt_align4.cu",
  383. "cutlass_tensorop_h884gemm_128x256_32x2_nt_align4.cu",
  384. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nt_align4.cu",
  385. "cutlass_tensorop_h884gemm_128x128_32x2_nt_align4.cu",
  386. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nt_align4.cu",
  387. "cutlass_tensorop_h884gemm_256x128_32x2_nt_align2.cu",
  388. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_nt_align2.cu",
  389. "cutlass_tensorop_h884gemm_128x256_32x2_nt_align2.cu",
  390. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_nt_align2.cu",
  391. "cutlass_tensorop_h884gemm_128x128_32x2_nt_align2.cu",
  392. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_nt_align2.cu",
  393. "cutlass_tensorop_h884gemm_256x128_32x2_tn_align8.cu",
  394. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tn_align8.cu",
  395. "cutlass_tensorop_h884gemm_128x256_32x2_tn_align8.cu",
  396. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tn_align8.cu",
  397. "cutlass_tensorop_h884gemm_128x128_32x2_tn_align8.cu",
  398. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tn_align8.cu",
  399. "cutlass_tensorop_h884gemm_256x128_32x2_tn_align4.cu",
  400. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tn_align4.cu",
  401. "cutlass_tensorop_h884gemm_128x256_32x2_tn_align4.cu",
  402. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tn_align4.cu",
  403. "cutlass_tensorop_h884gemm_128x128_32x2_tn_align4.cu",
  404. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tn_align4.cu",
  405. "cutlass_tensorop_h884gemm_256x128_32x2_tn_align2.cu",
  406. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tn_align2.cu",
  407. "cutlass_tensorop_h884gemm_128x256_32x2_tn_align2.cu",
  408. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tn_align2.cu",
  409. "cutlass_tensorop_h884gemm_128x128_32x2_tn_align2.cu",
  410. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tn_align2.cu",
  411. "cutlass_tensorop_h884gemm_256x128_32x2_tt_align8.cu",
  412. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tt_align8.cu",
  413. "cutlass_tensorop_h884gemm_128x256_32x2_tt_align8.cu",
  414. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tt_align8.cu",
  415. "cutlass_tensorop_h884gemm_128x128_32x2_tt_align8.cu",
  416. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tt_align8.cu",
  417. "cutlass_tensorop_h884gemm_256x128_32x2_tt_align4.cu",
  418. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tt_align4.cu",
  419. "cutlass_tensorop_h884gemm_128x256_32x2_tt_align4.cu",
  420. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tt_align4.cu",
  421. "cutlass_tensorop_h884gemm_128x128_32x2_tt_align4.cu",
  422. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tt_align4.cu",
  423. "cutlass_tensorop_h884gemm_256x128_32x2_tt_align2.cu",
  424. "cutlass_tensorop_h884gemm_split_k_parallel_256x128_32x2_tt_align2.cu",
  425. "cutlass_tensorop_h884gemm_128x256_32x2_tt_align2.cu",
  426. "cutlass_tensorop_h884gemm_split_k_parallel_128x256_32x2_tt_align2.cu",
  427. "cutlass_tensorop_h884gemm_128x128_32x2_tt_align2.cu",
  428. "cutlass_tensorop_h884gemm_split_k_parallel_128x128_32x2_tt_align2.cu",
  429. "all_gemm_tensorop884_operations.cu",
  430. "cutlass_simt_sgemv_batched_strided_1x128_32_tt_align4x4.cu",
  431. "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align4x2.cu",
  432. "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align4x1.cu",
  433. "cutlass_simt_sgemv_batched_strided_1x128_16_tt_align2x4.cu",
  434. "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align2x2.cu",
  435. "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align2x1.cu",
  436. "cutlass_simt_sgemv_batched_strided_1x128_8_tt_align1x4.cu",
  437. "cutlass_simt_sgemv_batched_strided_1x128_4_tt_align1x2.cu",
  438. "cutlass_simt_sgemv_batched_strided_1x128_2_tt_align1x1.cu",
  439. "cutlass_simt_sgemv_batched_strided_1x64_64_tt_align4x4.cu",
  440. "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align4x2.cu",
  441. "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align4x1.cu",
  442. "cutlass_simt_sgemv_batched_strided_1x64_32_tt_align2x4.cu",
  443. "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align2x2.cu",
  444. "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align2x1.cu",
  445. "cutlass_simt_sgemv_batched_strided_1x64_16_tt_align1x4.cu",
  446. "cutlass_simt_sgemv_batched_strided_1x64_8_tt_align1x2.cu",
  447. "cutlass_simt_sgemv_batched_strided_1x64_4_tt_align1x1.cu",
  448. "cutlass_simt_sgemv_batched_strided_1x32_128_tt_align4x4.cu",
  449. "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align4x2.cu",
  450. "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align4x1.cu",
  451. "cutlass_simt_sgemv_batched_strided_1x32_64_tt_align2x4.cu",
  452. "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align2x2.cu",
  453. "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align2x1.cu",
  454. "cutlass_simt_sgemv_batched_strided_1x32_32_tt_align1x4.cu",
  455. "cutlass_simt_sgemv_batched_strided_1x32_16_tt_align1x2.cu",
  456. "cutlass_simt_sgemv_batched_strided_1x32_8_tt_align1x1.cu",
  457. "cutlass_simt_s8_idgrad_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu",
  458. "cutlass_simt_s8_idgrad_s2_id_s8_32x128x32_32x64x32_2_nc4hw4_k4rsc4.cu",
  459. "cutlass_simt_s8_idgrad_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu",
  460. "cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x64x16_2_nc4hw4_k4rsc4.cu",
  461. "cutlass_simt_s8_idgrad_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu",
  462. "cutlass_simt_s8_idgrad_s2_id_s8_16x128x16_16x128x16_1_nc4hw4_k4rsc4.cu",
  463. "cutlass_simt_s8_idgrad_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu",
  464. "cutlass_simt_s8_idgrad_s2_id_s8_16x64x8_16x64x8_2_nc4hw4_k4rsc4.cu",
  465. "all_deconv_simt_operations.cu",
  466. "cutlass_simt_s8_ifprop_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  467. "cutlass_simt_s8_ifprop_1x1_id_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  468. "cutlass_simt_s8_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  469. "cutlass_simt_s8_ifprop_1x1_relu_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  470. "cutlass_simt_s8_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  471. "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  472. "cutlass_simt_s8_ifprop_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  473. "cutlass_simt_s8_ifprop_1x1_id_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  474. "cutlass_simt_s8_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  475. "cutlass_simt_s8_ifprop_1x1_relu_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  476. "cutlass_simt_s8_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  477. "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x64x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  478. "cutlass_simt_s8_ifprop_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  479. "cutlass_simt_s8_ifprop_1x1_id_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  480. "cutlass_simt_s8_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  481. "cutlass_simt_s8_ifprop_1x1_relu_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  482. "cutlass_simt_s8_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  483. "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x128x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  484. "cutlass_simt_s8_ifprop_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  485. "cutlass_simt_s8_ifprop_1x1_id_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  486. "cutlass_simt_s8_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  487. "cutlass_simt_s8_ifprop_1x1_relu_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  488. "cutlass_simt_s8_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  489. "cutlass_simt_s8_ifprop_1x1_hswish_s8_128x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  490. "cutlass_simt_s8_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  491. "cutlass_simt_s8_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  492. "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  493. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  494. "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  495. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  496. "cutlass_simt_s8_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  497. "cutlass_simt_s8_ifprop_1x1_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  498. "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  499. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  500. "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  501. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4.cu",
  502. "cutlass_simt_s8_ifprop_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  503. "cutlass_simt_s8_ifprop_1x1_id_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  504. "cutlass_simt_s8_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  505. "cutlass_simt_s8_ifprop_1x1_relu_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  506. "cutlass_simt_s8_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  507. "cutlass_simt_s8_ifprop_1x1_hswish_s8_64x32x32_64x32x32_2_nc4hw4_c4rsk4.cu",
  508. "cutlass_simt_s8_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
  509. "cutlass_simt_s8_ifprop_1x1_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
  510. "cutlass_simt_s8_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
  511. "cutlass_simt_s8_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
  512. "cutlass_simt_s8_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
  513. "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4.cu",
  514. "cutlass_simt_s8_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
  515. "cutlass_simt_s8_ifprop_1x1_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
  516. "cutlass_simt_s8_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
  517. "cutlass_simt_s8_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
  518. "cutlass_simt_s8_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
  519. "cutlass_simt_s8_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4.cu",
  520. "cutlass_simt_s8_ifprop_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  521. "cutlass_simt_s8_ifprop_1x1_id_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  522. "cutlass_simt_s8_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  523. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  524. "cutlass_simt_s8_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  525. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x128x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  526. "cutlass_simt_s8_ifprop_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  527. "cutlass_simt_s8_ifprop_1x1_id_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  528. "cutlass_simt_s8_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  529. "cutlass_simt_s8_ifprop_1x1_relu_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  530. "cutlass_simt_s8_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  531. "cutlass_simt_s8_ifprop_1x1_hswish_s8_32x64x32_32x64x32_2_nc4hw4_c4rsk4_nc32hw32.cu",
  532. "cutlass_simt_u4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
  533. "cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
  534. "cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
  535. "cutlass_simt_u4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
  536. "cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
  537. "cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
  538. "cutlass_simt_s4_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
  539. "cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
  540. "cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nhwc.cu",
  541. "cutlass_simt_s4_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
  542. "cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
  543. "cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nhwc.cu",
  544. "cutlass_simt_f32_ifprop_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
  545. "cutlass_simt_f32_ifprop_1x1_id_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
  546. "cutlass_simt_f32_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
  547. "cutlass_simt_f32_ifprop_1x1_relu_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
  548. "cutlass_simt_f32_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
  549. "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x128x16_16x128x16_1_nc4hw4_c4rsk4_nchw.cu",
  550. "cutlass_simt_f32_ifprop_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
  551. "cutlass_simt_f32_ifprop_1x1_id_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
  552. "cutlass_simt_f32_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
  553. "cutlass_simt_f32_ifprop_1x1_relu_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
  554. "cutlass_simt_f32_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
  555. "cutlass_simt_f32_ifprop_1x1_hswish_s8_16x64x8_16x64x8_2_nc4hw4_c4rsk4_nchw.cu",
  556. "all_conv2d_simt_operations.cu",
  557. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  558. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  559. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  560. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  561. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  562. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x256x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  563. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  564. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  565. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  566. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  567. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  568. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_256x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  569. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  570. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  571. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  572. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  573. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  574. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x128x64_64x64x64_2_nc32hw32_c32rsk32.cu",
  575. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
  576. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
  577. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
  578. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
  579. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
  580. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x64x64_64x32x64_2_nc32hw32_c32rsk32.cu",
  581. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
  582. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
  583. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
  584. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
  585. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
  586. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32.cu",
  587. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  588. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  589. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  590. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  591. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  592. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x64x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  593. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  594. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  595. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  596. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  597. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  598. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nc32hw32_c32rsk32.cu",
  599. "cutlass_tensorop_s8_i8816fprop_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
  600. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
  601. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
  602. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
  603. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
  604. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x128x64_32x64x64_2_nc32hw32_c32rsk32_nc4hw4.cu",
  605. "cutlass_tensorop_s8_i8816fprop_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
  606. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
  607. "cutlass_tensorop_s8_i8816fprop_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
  608. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
  609. "cutlass_tensorop_s8_i8816fprop_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
  610. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_32x128x32_32x64x32_1_nc32hw32_c32rsk32_nc4hw4.cu",
  611. "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  612. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  613. "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  614. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  615. "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  616. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  617. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  618. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  619. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  620. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  621. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  622. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  623. "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  624. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  625. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  626. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  627. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  628. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  629. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  630. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  631. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  632. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  633. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  634. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  635. "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  636. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  637. "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  638. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  639. "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  640. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  641. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  642. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  643. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  644. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  645. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  646. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  647. "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  648. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  649. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  650. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  651. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  652. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  653. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  654. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  655. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  656. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  657. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  658. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  659. "cutlass_tensorop_s8_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  660. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  661. "cutlass_tensorop_s8_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  662. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  663. "cutlass_tensorop_s8_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  664. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  665. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  666. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  667. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  668. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  669. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  670. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  671. "cutlass_tensorop_s8_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  672. "cutlass_tensorop_s8_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  673. "cutlass_tensorop_s8_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  674. "cutlass_tensorop_s8_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  675. "cutlass_tensorop_s8_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  676. "cutlass_tensorop_s8_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  677. "cutlass_tensorop_s8_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  678. "cutlass_tensorop_s8_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  679. "cutlass_tensorop_s8_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  680. "cutlass_tensorop_s8_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  681. "cutlass_tensorop_s8_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  682. "cutlass_tensorop_s8_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  683. "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  684. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  685. "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  686. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  687. "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  688. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  689. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  690. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  691. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  692. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  693. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  694. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  695. "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  696. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  697. "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  698. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  699. "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  700. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  701. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  702. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  703. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  704. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  705. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  706. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  707. "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  708. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  709. "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  710. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  711. "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  712. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  713. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  714. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  715. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  716. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  717. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  718. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  719. "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  720. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  721. "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  722. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  723. "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  724. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  725. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  726. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  727. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  728. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  729. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  730. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  731. "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  732. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  733. "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  734. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  735. "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  736. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc4hw4.cu",
  737. "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  738. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  739. "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  740. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  741. "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  742. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  743. "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  744. "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  745. "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  746. "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  747. "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  748. "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc4hw4.cu",
  749. "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  750. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  751. "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  752. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  753. "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  754. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  755. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  756. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  757. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  758. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  759. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  760. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  761. "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  762. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  763. "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  764. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  765. "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  766. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  767. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  768. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  769. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  770. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  771. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  772. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  773. "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  774. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  775. "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  776. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  777. "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  778. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  779. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  780. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  781. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  782. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  783. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  784. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  785. "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  786. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  787. "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  788. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  789. "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  790. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  791. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  792. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  793. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  794. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  795. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  796. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  797. "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  798. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  799. "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  800. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  801. "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  802. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc8hw8.cu",
  803. "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  804. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  805. "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  806. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  807. "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  808. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  809. "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  810. "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  811. "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  812. "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  813. "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  814. "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc8hw8.cu",
  815. "cutlass_tensorop_s4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  816. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  817. "cutlass_tensorop_s4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  818. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  819. "cutlass_tensorop_s4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  820. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  821. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  822. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  823. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  824. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  825. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  826. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  827. "cutlass_tensorop_s4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  828. "cutlass_tensorop_s4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  829. "cutlass_tensorop_s4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  830. "cutlass_tensorop_s4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  831. "cutlass_tensorop_s4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  832. "cutlass_tensorop_s4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  833. "cutlass_tensorop_s4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  834. "cutlass_tensorop_s4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  835. "cutlass_tensorop_s4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  836. "cutlass_tensorop_s4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  837. "cutlass_tensorop_s4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  838. "cutlass_tensorop_s4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  839. "cutlass_tensorop_u4_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  840. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  841. "cutlass_tensorop_u4_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  842. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  843. "cutlass_tensorop_u4_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  844. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  845. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  846. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  847. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  848. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  849. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  850. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  851. "cutlass_tensorop_u4_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  852. "cutlass_tensorop_u4_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  853. "cutlass_tensorop_u4_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  854. "cutlass_tensorop_u4_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  855. "cutlass_tensorop_u4_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  856. "cutlass_tensorop_u4_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  857. "cutlass_tensorop_u4_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  858. "cutlass_tensorop_u4_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  859. "cutlass_tensorop_u4_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  860. "cutlass_tensorop_u4_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  861. "cutlass_tensorop_u4_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  862. "cutlass_tensorop_u4_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  863. "cutlass_tensorop_f32_i8816fprop_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  864. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  865. "cutlass_tensorop_f32_i8816fprop_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  866. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  867. "cutlass_tensorop_f32_i8816fprop_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  868. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_128x32x32_64x32x32_1_nhwc_nc16hw16.cu",
  869. "cutlass_tensorop_f32_i8816fprop_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  870. "cutlass_tensorop_f32_i8816fprop_1x1_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  871. "cutlass_tensorop_f32_i8816fprop_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  872. "cutlass_tensorop_f32_i8816fprop_1x1_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  873. "cutlass_tensorop_f32_i8816fprop_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  874. "cutlass_tensorop_f32_i8816fprop_1x1_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  875. "cutlass_tensorop_f32_i8816fprop_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  876. "cutlass_tensorop_f32_i8816fprop_1x1_roc_id_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  877. "cutlass_tensorop_f32_i8816fprop_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  878. "cutlass_tensorop_f32_i8816fprop_1x1_roc_relu_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  879. "cutlass_tensorop_f32_i8816fprop_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  880. "cutlass_tensorop_f32_i8816fprop_1x1_roc_hswish_s8_64x16x32_64x16x32_2_nhwc_nc16hw16.cu",
  881. "all_conv2d_tensorop8816_operations.cu",
  882. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  883. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  884. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  885. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  886. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  887. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  888. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  889. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  890. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  891. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  892. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  893. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  894. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  895. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  896. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  897. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  898. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  899. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  900. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  901. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  902. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  903. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  904. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  905. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  906. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  907. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  908. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  909. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x256x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  910. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  911. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  912. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  913. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  914. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  915. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  916. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  917. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x128_64x64x128_2_nc64hw64_c64rsk64.cu",
  918. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  919. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  920. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  921. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nc64hw64_c64rsk64.cu",
  922. "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  923. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  924. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  925. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  926. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  927. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  928. "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  929. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  930. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  931. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  932. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  933. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  934. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  935. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  936. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  937. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  938. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  939. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  940. "cutlass_tensorop_s4_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  941. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  942. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  943. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  944. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  945. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  946. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  947. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  948. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  949. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  950. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  951. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  952. "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  953. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  954. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  955. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  956. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  957. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  958. "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  959. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  960. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  961. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  962. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  963. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  964. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  965. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  966. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  967. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  968. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  969. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  970. "cutlass_tensorop_s4_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  971. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  972. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  973. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  974. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  975. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  976. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  977. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  978. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  979. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  980. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  981. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  982. "cutlass_tensorop_s4_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  983. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  984. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  985. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  986. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  987. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  988. "cutlass_tensorop_s4_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  989. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  990. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  991. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  992. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  993. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  994. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  995. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  996. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  997. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  998. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  999. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1000. "cutlass_tensorop_s4_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1001. "cutlass_tensorop_s4_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1002. "cutlass_tensorop_s4_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1003. "cutlass_tensorop_s4_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1004. "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1005. "cutlass_tensorop_s4_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1006. "cutlass_tensorop_s4_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1007. "cutlass_tensorop_s4_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1008. "cutlass_tensorop_s4_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1009. "cutlass_tensorop_s4_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1010. "cutlass_tensorop_s4_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1011. "cutlass_tensorop_s4_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1012. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1013. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1014. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1015. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1016. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1017. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1018. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1019. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1020. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1021. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1022. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1023. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1024. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1025. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1026. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1027. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1028. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1029. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1030. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1031. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1032. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1033. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1034. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1035. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1036. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1037. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1038. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1039. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1040. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1041. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1042. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1043. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1044. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1045. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1046. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1047. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1048. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1049. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1050. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1051. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1052. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1053. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1054. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1055. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1056. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1057. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1058. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1059. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1060. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1061. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1062. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1063. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1064. "cutlass_tensorop_u4_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1065. "cutlass_tensorop_u4_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1066. "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1067. "cutlass_tensorop_u4_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1068. "cutlass_tensorop_u4_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1069. "cutlass_tensorop_u4_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1070. "cutlass_tensorop_u4_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1071. "cutlass_tensorop_u4_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1072. "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1073. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1074. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1075. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1076. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1077. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1078. "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1079. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1080. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1081. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1082. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1083. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1084. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1085. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1086. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1087. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1088. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1089. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1090. "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1091. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1092. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1093. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1094. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1095. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1096. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1097. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1098. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1099. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1100. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1101. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1102. "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1103. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1104. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1105. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1106. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1107. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1108. "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1109. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1110. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1111. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1112. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1113. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1114. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1115. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1116. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1117. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1118. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1119. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1120. "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1121. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1122. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1123. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1124. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1125. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1126. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1127. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1128. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1129. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1130. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1131. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1132. "cutlass_tensorop_s8_i8832fprop_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1133. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1134. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1135. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1136. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1137. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1138. "cutlass_tensorop_s8_i8832fprop_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1139. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1140. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1141. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1142. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1143. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1144. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1145. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1146. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1147. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1148. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1149. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1150. "cutlass_tensorop_s8_i8832fprop_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1151. "cutlass_tensorop_s8_i8832fprop_1x1_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1152. "cutlass_tensorop_s8_i8832fprop_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1153. "cutlass_tensorop_s8_i8832fprop_1x1_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1154. "cutlass_tensorop_s8_i8832fprop_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1155. "cutlass_tensorop_s8_i8832fprop_1x1_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1156. "cutlass_tensorop_s8_i8832fprop_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1157. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1158. "cutlass_tensorop_s8_i8832fprop_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1159. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1160. "cutlass_tensorop_s8_i8832fprop_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1161. "cutlass_tensorop_s8_i8832fprop_1x1_roc_hswish_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1162. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1163. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1164. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1165. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc8hw8.cu",
  1166. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1167. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1168. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1169. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1170. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1171. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1172. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1173. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc8hw8.cu",
  1174. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1175. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1176. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1177. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1178. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1179. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1180. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1181. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc8hw8.cu",
  1182. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1183. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1184. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1185. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc16hw16.cu",
  1186. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1187. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1188. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1189. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1190. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1191. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1192. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1193. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc16hw16.cu",
  1194. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1195. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1196. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1197. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1198. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1199. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1200. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1201. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc16hw16.cu",
  1202. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1203. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1204. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1205. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x16x64_128x16x64_2_nhwc_nc32hw32.cu",
  1206. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1207. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1208. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1209. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1210. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1211. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1212. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1213. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x32x64_64x32x64_1_nhwc_nc32hw32.cu",
  1214. "cutlass_tensorop_s8_i8832fprop_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1215. "cutlass_tensorop_s8_i8832fprop_1x1_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1216. "cutlass_tensorop_s8_i8832fprop_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1217. "cutlass_tensorop_s8_i8832fprop_1x1_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1218. "cutlass_tensorop_s8_i8832fprop_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1219. "cutlass_tensorop_s8_i8832fprop_1x1_roc_id_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1220. "cutlass_tensorop_s8_i8832fprop_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1221. "cutlass_tensorop_s8_i8832fprop_1x1_roc_relu_u4_s4_128x64x64_64x64x64_1_nhwc_nc32hw32.cu",
  1222. "all_conv2d_tensorop8832_operations.cu",
  1223. ]

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台