You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

conv_bias_int8.cpp 52 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179
  1. #include "megdnn/oprs/nn.h"
  2. #include "src/common/utils.h"
  3. #include "src/cuda/cudnn_with_check.h"
  4. #include "test/common/checker.h"
  5. #include "test/common/conv_bias.h"
  6. #include "test/common/tensor.h"
  7. #include "test/common/workspace_wrapper.h"
  8. #include "test/cuda/benchmark.h"
  9. #include "test/cuda/conv_test_utils.h"
  10. #include "test/cuda/fixture.h"
  11. #include "test/cuda/utils.h"
  12. namespace megdnn {
  13. namespace test {
  14. namespace conv {
// Int8 NCHW4 conv-bias through the cuDNN ConvBiasActivation algorithm.
// check_conv_bias(src_dtype, filter_dtype, bias_dtype, dst_dtype, handle, algo, format[, args])
// runs the checker over a set of shapes for the named algorithm; the bias scale
// 1.2f * 1.3f is the product of the src and filter scales, as required for a
// QuantizedS32 accumulator.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CUDNN_CONVOLUTION) {
    // Skips unless the device reports compute capability >= 7.5.
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "DEFAULT:CUDNN:ConvBiasActivation:", param::ConvBias::Format::NCHW4);
}
// Dot-product implicit-GEMM algorithm, 1x1 filters (get_int8_nchw4_args(1)).
// Capability 6.1 gate — presumably for the int8 dot-product instruction; confirm.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args(1));
}
// Same algorithm with the default argument set (no explicit args; 3x3 per test name).
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4);
}
// 5x5 filters.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args(5));
}
// 7x7 filters.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args(7));
}
  53. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_WITH_Z) {
  54. require_compute_capability(6, 1);
  55. Checker<ConvBiasForward> checker(handle_cuda());
  56. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  57. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  58. UniformIntRNG rng{-3, 3};
  59. UniformIntRNG bias_rng{-50, 50};
  60. checker.set_rng(0, &rng)
  61. .set_rng(1, &rng)
  62. .set_rng(2, &bias_rng)
  63. .set_rng(3, &rng)
  64. .set_dtype(0, dtype::QuantizedS8{1.2f})
  65. .set_dtype(1, dtype::QuantizedS8{1.3f})
  66. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  67. .set_dtype(3, dtype::QuantizedS8{1.1f})
  68. .set_dtype(4, dtype::QuantizedS8{1.0f})
  69. .set_epsilon(1 + 1e-3)
  70. .set_max_avg_error(1e-1)
  71. .set_max_avg_biased_error(1e-1);
  72. param::ConvBias param;
  73. param.pad_h = param.pad_w = 1;
  74. param.stride_h = param.stride_w = 1;
  75. param.format = param::ConvBias::Format::NCHW4;
  76. checker.set_param(param).execs(
  77. {{32, 4, 12, 12, 4},
  78. {16, 4, 3, 3, 4},
  79. {1, 4, 1, 1, 4},
  80. {32, 4, 12, 12, 4},
  81. {}});
  82. }
// Same residual-add ("z") coverage as CONV_BIAS_INT8_NCHW4_WITH_Z, but at
// stride 2 — so z takes the downsampled 6x6 output shape.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_STRIDE2_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            // Bias scale = src_scale * filter_scale (QuantizedS32 accumulator).
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 2;
    param.format = param::ConvBias::Format::NCHW4;
    // {src, filter, bias, z, dst(deduced)}; 12x12 input, stride 2 -> 6x6 z.
    checker.set_param(param).execs(
            {{32, 4, 12, 12, 4},
             {16, 4, 3, 3, 4},
             {1, 4, 1, 1, 4},
             {32, 4, 6, 6, 4},
             {}});
}
// "Check bounds" variants: argument sets chosen to exercise out-of-bounds /
// boundary handling of the implicit-GEMM kernels, per filter size.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(7));
}
// CHWN4 layout variant of the dot-product algorithm, default argument set.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4);
}
// Residual-add ("z") test in CHWN4 layout; tensor shapes are C/4, H, W, N, 4.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            // Note: z and dst both use scale 1.1f here (the NCHW4 twin uses 1.0f
            // for dst).
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.1f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    // {src, filter, bias, z, dst(deduced)} in CHWN4.
    checker.set_param(param).execs(
            {{4, 12, 12, 32, 4},
             {4, 3, 3, 16, 4},
             {4, 1, 1, 1, 4},
             {4, 12, 12, 32, 4},
             {}});
}
// H_SWISH nonlinearity in CHWN4 layout. No z tensor (slot 3 left empty in
// execs); dst uses a very small scale (0.001f) so the h-swish output range is
// representable in int8.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(4, dtype::QuantizedS8{0.001f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
    checker.set_param(param).execs(
            {{4, 12, 12, 32, 4}, {4, 3, 3, 16, 4}, {4, 1, 1, 1, 4}, {}, {}});
}
// CHWN4 boundary-handling args (3x3) for the dot-product algorithm.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
// "Small channel" argument sets: shapes with few input channels, one test per
// filter size 1/3/5/7.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(7));
}
// Small-channel + boundary handling, NCHW4 layout (3x3 only).
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_SMALL_CHANNEL_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_small_channel_args_check_bounds(3));
}
// Small-channel + boundary handling, CHWN4 layout, filter sizes 1/5/7
// (3x3 has no dedicated test here).
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(7));
}
// NCHW4 IMMA (tensor-core) implicit-GEMM tests; gated on compute capability
// 7.5. The mmaMxNxK suffix selects the warp-level MMA tile shape.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_3x3) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(7));
}
// Boundary-handling args (3x3) across the three MMA tile shapes.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma8x32x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma32x8x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
// CHWN4 IMMA (tensor-core) tests across the three MMA tile shapes; note the
// dst scale is 1.1f here, unlike the 1.3f used in the NCHW4 tensor-core tests.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma32x8x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma8x32x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
// Boundary handling per filter size (1/5/7) for the 16x16x16 tile shape.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(7));
}
// Residual-add ("z") path for the NCHW4 tensor-core algorithm; larger channel
// counts than the dot-product WITH_Z test (8*4=32 in, 16*4=64 out).
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::NCHW4;
    // {src, filter, bias, z, dst(deduced)}; z matches the 16-group output.
    checker.set_param(param).execs(
            {{64, 8, 12, 12, 4},
             {64, 8, 3, 3, 4},
             {1, 16, 1, 1, 4},
             {64, 16, 12, 12, 4},
             {}});
}
// CHWN4 twin of the tensor-core residual-add test; same scales and ranges,
// shapes permuted to C/4, H, W, N, 4.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    checker.set_param(param).execs(
            {{8, 12, 12, 64, 4},
             {8, 3, 3, 64, 4},
             {16, 1, 1, 1, 4},
             {16, 12, 12, 64, 4},
             {}});
}
// "Reorder filter" tensor-core variants (algo names carry REORDER_FILTER):
// boundary-handling args first, then the default 3x3 args, each across the
// three MMA tile shapes.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// "Unroll width" tensor-core variants. Scales differ between tests: ALGO_0 and
// the per-filter-size tests use dst scale 1.1f, ALGO_1/ALGO_2 use 1.3f —
// presumably intentional per-kernel tolerance tuning; confirm before unifying.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// Per-filter-size coverage for the 16x16x16 tile: 1x1 uses the regular args,
// 5x5/7x7 use the small-batch argument sets.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(7));
}
// Small-batch coverage for the remaining tile shapes (5x5 and 1x1).
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
  581. TEST_F(CUDA, FALLBACK_CONV_QS8) {
  582. require_compute_capability_eq(7, 5);
  583. Checker<ConvBiasForward> checker(handle_cuda());
  584. auto check = [&checker](const std::string&& algo, const std::string&& sub_algo) {
  585. checker.set_before_exec_callback(
  586. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  587. {algo.c_str(), {sub_algo.c_str()}}));
  588. UniformIntRNG rng{-3, 3};
  589. UniformIntRNG bias_rng{-50, 50};
  590. checker.set_rng(0, &rng)
  591. .set_rng(1, &rng)
  592. .set_rng(2, &bias_rng)
  593. .set_rng(3, &rng)
  594. .set_dtype(0, dtype::QuantizedS8{1.2f})
  595. .set_dtype(1, dtype::QuantizedS8{1.3f})
  596. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  597. .set_dtype(3, dtype::QuantizedS8{19.990229f})
  598. .set_dtype(4, dtype::QuantizedS8{19.990228f})
  599. .set_epsilon(1e-3)
  600. .set_max_avg_error(1e-1)
  601. .set_max_avg_biased_error(1e-3);
  602. param::ConvBias param;
  603. param.pad_h = param.pad_w = 1;
  604. param.stride_h = param.stride_w = 2;
  605. param.format = param::ConvBias::Format::NCHW;
  606. checker.set_param(param).execs(
  607. {{16, 15, 14, 14}, {28, 15, 3, 3}, {1, 28, 1, 1}, {16, 28, 7, 7}, {}});
  608. checker.set_param(param).execs(
  609. {{16, 32, 14, 14}, {32, 32, 3, 3}, {1, 32, 1, 1}, {}, {}});
  610. };
  611. check("FALLBACK_CONV_NCHW_QS8", "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM");
  612. }
  613. TEST_F(CUDA, FALLBACK_CONV_QS8_F32) {
  614. require_compute_capability_eq(7, 5);
  615. Checker<ConvBiasForward> checker(handle_cuda());
  616. auto check = [&checker](const std::string&& algo, const std::string&& sub_algo) {
  617. checker.set_before_exec_callback(
  618. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  619. {algo.c_str(), {sub_algo.c_str()}}));
  620. UniformIntRNG rng{-3, 3};
  621. UniformFloatRNG bias_rng{-50.f, 50.f};
  622. checker.set_rng(0, &rng)
  623. .set_rng(1, &rng)
  624. .set_rng(2, &bias_rng)
  625. .set_rng(3, &rng)
  626. .set_dtype(0, dtype::QuantizedS8{1.2f})
  627. .set_dtype(1, dtype::QuantizedS8{1.3f})
  628. .set_dtype(2, dtype::Float32{})
  629. .set_dtype(3, dtype::Float32{})
  630. .set_dtype(4, dtype::Float32{})
  631. .set_epsilon(1e-3)
  632. .set_max_avg_error(1e-1)
  633. .set_max_avg_biased_error(1e-3);
  634. param::ConvBias param;
  635. param.pad_h = param.pad_w = 1;
  636. param.stride_h = param.stride_w = 2;
  637. param.format = param::ConvBias::Format::NCHW;
  638. checker.set_param(param).execs(
  639. {{16, 15, 14, 14}, {28, 15, 3, 3}, {1, 28, 1, 1}, {16, 28, 7, 7}, {}});
  640. checker.set_param(param).execs(
  641. {{16, 32, 14, 14}, {32, 32, 3, 3}, {1, 32, 1, 1}, {}, {}});
  642. };
  643. check("FALLBACK_CONV_NCHW_QS8", "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM");
  644. }
  645. TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_WEIGHT_PREPROCESS) {
  646. require_compute_capability(6, 1);
  647. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  648. handle_cuda());
  649. auto check = [&checker](const std::string& algo) {
  650. checker.set_before_exec_callback(
  651. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
  652. UniformIntRNG rng{-16, 16};
  653. UniformIntRNG bias_rng{-50, 50};
  654. UniformIntRNG const_rng{1, 1};
  655. checker.set_rng(0, &rng)
  656. .set_rng(1, &rng)
  657. .set_rng(2, &bias_rng)
  658. .set_rng(3, &rng)
  659. .set_dtype(0, dtype::QuantizedS8{1.2f})
  660. .set_dtype(1, dtype::QuantizedS8{1.3f})
  661. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  662. .set_dtype(3, dtype::QuantizedS8{1.3f})
  663. .set_dtype(4, dtype::QuantizedS8{1.0f})
  664. .set_epsilon(1 + 1e-3)
  665. .set_max_avg_error(1e-1)
  666. .set_max_avg_biased_error(1e-3);
  667. param::ConvBias param;
  668. param.pad_h = param.pad_w = 1;
  669. param.stride_h = param.stride_w = 2;
  670. param.format = param::ConvBias::Format::NCHW4;
  671. checker.set_param(param).execs(
  672. {{16, 4, 14, 14, 4}, {16, 4, 3, 3, 4}, {1, 4, 1, 1, 4}, {}, {}});
  673. };
  674. check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32X32_64X32X32");
  675. check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8");
  676. }
  677. #if CUDA_VERSION >= 10020
/// \note: we only check several cases and block sizes in megdnn_test; the
/// full test cases are written in the cutlass repository
  680. TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
  681. require_compute_capability_eq(7, 5);
  682. Checker<ConvBiasForward> checker(handle_cuda());
  683. auto check = [&checker](const std::string& algo) {
  684. checker.set_before_exec_callback(
  685. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
  686. UniformIntRNG rng{-8, 8};
  687. UniformIntRNG bias_rng{-50, 50};
  688. UniformIntRNG const_rng{1, 1};
  689. // use scale that are all integers to avoid rouding error
  690. checker.set_rng(0, &rng)
  691. .set_rng(1, &rng)
  692. .set_rng(2, &bias_rng)
  693. .set_rng(3, &rng)
  694. .set_dtype(0, dtype::QuantizedS8{6.0f})
  695. .set_dtype(1, dtype::QuantizedS8{1.0f})
  696. .set_dtype(2, dtype::QuantizedS32{6.0f})
  697. .set_dtype(3, dtype::QuantizedS8{1.0f})
  698. .set_dtype(4, dtype::QuantizedS8{6.0f})
  699. .set_epsilon(1e-3);
  700. param::ConvBias param;
  701. param.pad_h = param.pad_w = 1;
  702. param.stride_h = param.stride_w = 1;
  703. param.format = param::ConvBias::Format::NCHW32;
  704. checker.set_param(param).execs(
  705. {{16, 8, 7, 7, 32}, {256, 8, 3, 3, 32}, {1, 8, 1, 1, 32}, {}, {}});
  706. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  707. checker.set_param(param).execs(
  708. {{16, 8, 7, 7, 32}, {256, 8, 1, 1, 32}, {1, 8, 1, 1, 32}, {}, {}});
  709. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  710. checker.set_param(param).execs(
  711. {{16, 8, 7, 7, 32}, {256, 8, 3, 3, 32}, {1, 8, 1, 1, 32}, {}, {}});
  712. // use non integer scale
  713. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  714. checker.set_dtype(0, dtype::QuantizedS8{1.1f})
  715. .set_dtype(1, dtype::QuantizedS8{1.2f})
  716. .set_dtype(2, dtype::QuantizedS32{1.1f * 1.2f})
  717. .set_dtype(3, dtype::QuantizedS8{1.1f})
  718. .set_dtype(4, dtype::QuantizedS8{6.0f})
  719. .set_epsilon(1 + 1e-3)
  720. .set_max_avg_error(1e-1)
  721. .set_max_avg_biased_error(1e-1)
  722. .execs({{16, 8, 7, 7, 32},
  723. {256, 8, 3, 3, 32},
  724. {1, 8, 1, 1, 32},
  725. {16, 8, 7, 7, 32},
  726. {}});
  727. };
  728. std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
  729. "INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X128X64_64X64X64_2",
  730. ConvBias::DirectParam{});
  731. check(algo);
  732. algo = ConvBias::algo_name<ConvBias::DirectParam>(
  733. "INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1",
  734. ConvBias::DirectParam{});
  735. check(algo);
  736. }
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    // Runs a padded 3x3 conv and an unpadded 1x1 RELU conv with the given
    // cutlass NHWC int8 algorithm forced via the before-exec callback.
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        // tensor slots: 0 src, 1 filter, 2 bias, 3 z, 4 dst
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{1.2f})
                .set_dtype(1, dtype::QuantizedS8{1.3f})
                // bias scale = src scale * filter scale
                .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
                .set_dtype(3, dtype::QuantizedS8{19.990229f})
                .set_dtype(4, dtype::QuantizedS8{19.990228f})
                .set_epsilon(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NHWC;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
        param.pad_h = param.pad_w = 0;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
    };
    // check two block-size configurations (see note above the test group)
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
            ConvBias::DirectParam{});
    check(algo);
}
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_UINT4_WEIGHT_PREPROCESS) {
    require_compute_capability(7, 5);
    // Checker with the weight-preprocess proxy so the algorithm's weight
    // preprocessing path is exercised as well.
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        // unsigned 4-bit value range for the Quantized4Asymm tensors
        UniformIntRNG rng_u4{0, 15};
        // slots: 0 src (qs8), 1 filter (qs8), 2 bias (qs32), 3 z (u4), 4 dst (u4)
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng_u4)
                .set_dtype(0, dtype::QuantizedS8{0.2f})
                .set_dtype(1, dtype::QuantizedS8{0.3f})
                .set_dtype(2, dtype::QuantizedS32{0.2f * 0.3f})
                // Quantized4Asymm{scale, zero_point}
                .set_dtype(3, dtype::Quantized4Asymm{0.5f, 8})
                .set_dtype(4, dtype::Quantized4Asymm{0.5f, 4})
                // epsilon > 1 tolerates a one-step quantization difference
                // (NOTE(review): confirm Checker epsilon semantics)
                .set_epsilon(1 + 1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NHWC;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
        param.pad_h = param.pad_w = 0;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
    };
    // check two block-size configurations (see note above the test group)
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
            ConvBias::DirectParam{});
    check(algo);
}
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_FLOAT) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    // NHWC int8 src/filter with float32 bias/z/dst outputs.
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformFloatRNG float_rng{-50, 50};
        // slots: 0 src (qs8), 1 filter (qs8), 2 bias, 3 z, 4 dst (f32)
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &float_rng)
                .set_rng(3, &float_rng)
                .set_dtype(0, dtype::QuantizedS8(1.9980618f))
                .set_dtype(1, dtype::QuantizedS8(1.9980927f))
                .set_dtype(2, dtype::Float32())
                .set_dtype(3, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NHWC;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
        param.pad_h = param.pad_w = 0;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
    };
    // check two block-size configurations (see note above the test group)
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
            ConvBias::DirectParam{});
    check(algo);
}
  852. #endif
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) {
    require_compute_capability(6, 1);
    using namespace conv_bias;
    Checker<ConvBiasForward> checker(handle_cuda());
    UniformIntRNG int_rng{-3, 3};
    UniformFloatRNG float_rng{-50, 50};
    ConvBias::Param param;
    // NCHW4 int8 input producing a float32 NCHW output
    param.format = ConvBias::Param::Format::NCHW4_NCHW;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    // slots: 0 src (qs8), 1 filter (qs8), 2 bias (f32), 3 z (f32), 4 dst (f32)
    checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &float_rng)
            .set_rng(3, &float_rng)
            .set_param(param);
    auto opr = handle_cuda()->create_operator<ConvBias>();
    auto run = [&](const TensorShapeArray& shapes) {
        opr->param() = param;
        TensorLayout dst_layout;
        // deduce the (NCHW float) dst layout so the z tensor in slot 3 can be
        // given a matching shape
        opr->deduce_layout(
                {shapes[0], dtype::Float32()}, {shapes[1], dtype::Float32()}, {}, {},
                dst_layout);
        checker.execs({shapes[0], shapes[1], shapes[2], dst_layout, {}});
    };
    // vary output channel count, including values not divisible by 4
    run({{16, 4, 23, 40, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}});
    run({{16, 4, 92, 160, 4}, {24, 4, 3, 3, 4}, {1, 24, 1, 1}});
    run({{16, 4, 92, 160, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}});
    run({{16, 4, 92, 160, 4}, {16, 4, 3, 3, 4}, {1, 16, 1, 1}});
    run({{16, 4, 92, 160, 4}, {8, 4, 3, 3, 4}, {1, 8, 1, 1}});
    run({{16, 4, 46, 80, 4}, {4, 4, 3, 3, 4}, {1, 4, 1, 1}});
}
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW32) {
    require_compute_capability(6, 1);
    using namespace conv_bias;
    Checker<ConvBiasForward> checker(handle_cuda());
    UniformIntRNG int_rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    ConvBias::Param param;
    // NCHW4 int8 input producing an NCHW32 int8 output
    param.format = ConvBias::Param::Format::NCHW4_NCHW32;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    // slots: 0 src, 1 filter, 2 bias, 3 z, 4 dst;
    // bias scale = src scale * filter scale
    checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f))
            .set_dtype(3, dtype::QuantizedS8(1.9980618f))
            .set_dtype(4, dtype::QuantizedS8(1.9980618f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &int_rng)
            .set_param(param);
    auto run = [&](const TensorShapeArray& shapes) {
        // dst shape is deduced by the checker (empty layout in slot 4)
        checker.execs({shapes[0], shapes[1], shapes[2], {}, {}});
    };
    // bias is laid out in NCHW32 blocks ({1, 1, 1, 1, 32})
    run({{16, 4, 23, 40, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}});
    run({{16, 4, 92, 160, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}});
    run({{16, 4, 46, 80, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}});
}
  918. #if CUDA_VERSION >= 10020
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) {
    require_compute_capability(7, 5);
    using namespace conv_bias;
    Checker<ConvBiasForward> checker(handle_cuda());
    UniformIntRNG int_rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    ConvBias::Param param;
    // NCHW32 int8 input producing an NCHW4 int8 output
    param.format = ConvBias::Param::Format::NCHW32_NCHW4;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBias::algo_name<ConvBias::DirectParam>(
                    "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X128X32_32X64X32_1",
                    ConvBias::DirectParam{})
                    .c_str()));
    // slots: 0 src, 1 filter, 2 bias, 3 z, 4 dst;
    // bias scale = src scale * filter scale
    checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f))
            .set_dtype(3, dtype::QuantizedS8(1.9980618f))
            .set_dtype(4, dtype::QuantizedS8(1.9980618f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &int_rng)
            .set_param(param);
    auto run = [&](const TensorShapeArray& shapes) {
        // dst shape is deduced by the checker (empty layout in slot 4)
        checker.execs({shapes[0], shapes[1], shapes[2], {}, {}});
    };
    // bias is laid out in NCHW4 blocks; OC includes values not divisible by 32
    run({{16, 2, 23, 40, 32}, {20, 2, 3, 3, 32}, {1, 5, 1, 1, 4}});
    run({{16, 1, 92, 160, 32}, {24, 1, 3, 3, 32}, {1, 6, 1, 1, 4}});
    run({{16, 2, 46, 80, 32}, {4, 2, 3, 3, 32}, {1, 1, 1, 1, 4}});
}
  950. #endif
  951. #if MEGDNN_WITH_BENCHMARK
  952. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
  953. require_compute_capability(6, 1);
  954. benchmark_target_algo(
  955. handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
  956. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  957. dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  958. param::ConvBias::Format::CHWN4);
  959. }
  960. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4) {
  961. require_compute_capability(6, 1);
  962. benchmark_target_algo(
  963. handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
  964. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  965. dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  966. param::ConvBias::Format::NCHW4);
  967. }
  968. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE) {
  969. require_compute_capability(7, 5);
  970. benchmark_target_algo_with_cudnn_tsc(
  971. handle_cuda(), get_resnet50_bench_args(256), dtype::QuantizedS8{1.2f},
  972. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  973. dtype::QuantizedS8{1.0f}, "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  974. param::ConvBias::Format::CHWN4);
  975. }
  976. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE_ALL_ALGO) {
  977. require_compute_capability(7, 5);
  978. benchmark_target_algo_with_cudnn_tsc(
  979. handle_cuda(), get_resnet50_bench_args(256), dtype::QuantizedS8{1.2f},
  980. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  981. dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
  982. }
  983. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_DET_ALL_ALGO) {
  984. require_compute_capability(7, 5);
  985. benchmark_target_algo_with_cudnn_tsc(
  986. handle_cuda(), get_detection_bench_args(), dtype::QuantizedS8{1.2f},
  987. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  988. dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
  989. }
  990. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_TENSORCORE) {
  991. require_compute_capability(7, 5);
  992. benchmark_target_algo_with_cudnn_tsc(
  993. handle_cuda(), get_resnet50_bench_args(256), dtype::QuantizedS8{1.2f},
  994. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  995. dtype::QuantizedS8{1.0f}, "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  996. param::ConvBias::Format::NCHW4);
  997. }
  998. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
  999. require_compute_capability(6, 1);
  1000. std::vector<BenchArgs> args;
  1001. args.push_back(BenchArgs{64, 4, 224, 224, 64, 7, 2});
  1002. benchmark_target_algo(
  1003. handle_cuda(), args, dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1004. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
  1005. "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4);
  1006. }
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    using namespace conv_bias;
    UniformIntRNG int_rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    ConvBias::Param param;
    // NCHW4 int8 input producing a float32 NCHW output
    param.format = ConvBias::Param::Format::NCHW4_NCHW;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    benchmarker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    // slots: 0 src (qs8), 1 filter (qs8), 2 bias (f32), 3 z (f32), 4 dst (f32)
    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    auto run = [&](const TensorShapeArray& shapes) {
        // execs() result is divided by RUNS to get average per-iteration time
        auto time_in_ms =
                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) / RUNS;
        printf("src=%s, filter=%s, dst=%s, time=%.2f\n", shapes[0].to_string().c_str(),
               shapes[1].to_string().c_str(), shapes[2].to_string().c_str(),
               time_in_ms);
    };
    // same filter/bias at three spatial resolutions
    run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
}
  1039. #if CUDA_VERSION >= 10020
  1040. TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
  1041. require_compute_capability(7, 5);
  1042. benchmark_target_algo_with_cudnn_tsc(
  1043. handle_cuda(), get_resnet50_bench_args(256), dtype::QuantizedS8{1.2f},
  1044. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  1045. dtype::QuantizedS8{1.0f}, "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
  1046. param::ConvBias::Format::NCHW32);
  1047. }
  1048. TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NHWC) {
  1049. require_compute_capability(7, 5);
  1050. benchmark_target_algo_with_cudnn_tsc(
  1051. handle_cuda(), get_det_first_bench_args(16), dtype::QuantizedS8{1.2f},
  1052. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  1053. dtype::QuantizedS8{1.0f}, "DIRECT:INT8_NHWC_IMMA_IMPLICIT_GEMM",
  1054. param::ConvBias::Format::NHWC);
  1055. }
  1056. #endif
  1057. TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
  1058. require_compute_capability(6, 1);
  1059. benchmark_target_algo(
  1060. handle_cuda(), get_resnet50_bench_args(64), dtype::QuantizedS8{1.2f},
  1061. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  1062. dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  1063. param::ConvBias::Format::NCHW4);
  1064. }
  1065. TEST_F(CUDA, BENCHMARK_SASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) {
  1066. require_compute_capability(6, 1);
  1067. std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
  1068. "SASS_INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32_64", ConvBias::DirectParam{});
  1069. benchmark_target_algo(
  1070. handle_cuda(), get_det_first_bench_args(16), dtype::QuantizedS8{1.2f},
  1071. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  1072. dtype::QuantizedS8{1.0f}, algo.c_str(), param::ConvBias::Format::NCHW4);
  1073. }
  1074. TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) {
  1075. require_compute_capability(6, 1);
  1076. benchmark_target_algo(
  1077. handle_cuda(), get_det_first_bench_args(16), dtype::QuantizedS8{1.2f},
  1078. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  1079. dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16",
  1080. param::ConvBias::Format::NCHW4);
  1081. }
  1082. #endif
  1083. } // namespace conv
  1084. } // namespace test
  1085. } // namespace megdnn
  1086. // vim: syntax=cpp.doxygen