You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

conv_bias_int8.cpp 44 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024
  1. /**
  2. * \file dnn/test/cuda/conv_bias_int8.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/oprs/nn.h"
  13. #include "src/common/utils.h"
  14. #include "src/cuda/cudnn_with_check.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/conv_bias.h"
  17. #include "test/cuda/benchmark.h"
  18. #include "test/cuda/fixture.h"
  19. #include "test/cuda/utils.h"
  20. #include "test/common/tensor.h"
  21. #include "test/common/workspace_wrapper.h"
  22. #include "test/cuda/conv_test_utils.h"
  23. namespace megdnn {
  24. namespace test {
  25. namespace conv{
  26. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
  27. require_compute_capability(6, 1);
  28. conv_bias::check_conv_bias(
  29. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  30. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  31. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  32. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(1));
  33. }
  34. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_3x3) {
  35. require_compute_capability(6, 1);
  36. conv_bias::check_conv_bias(
  37. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  38. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  39. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  40. param::ConvBias::Format::NCHW4);
  41. }
  42. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_5x5) {
  43. require_compute_capability(6, 1);
  44. conv_bias::check_conv_bias(
  45. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  46. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  47. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  48. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(5));
  49. }
  50. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_7x7) {
  51. require_compute_capability(6, 1);
  52. conv_bias::check_conv_bias(
  53. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  54. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  55. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  56. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(7));
  57. }
  58. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_WITH_Z) {
  59. require_compute_capability(6, 1);
  60. Checker<ConvBiasForward> checker(handle_cuda());
  61. checker.set_before_exec_callback(
  62. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  63. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  64. UniformIntRNG rng{-3, 3};
  65. UniformIntRNG bias_rng{-50, 50};
  66. checker.set_rng(0, &rng)
  67. .set_rng(1, &rng)
  68. .set_rng(2, &bias_rng)
  69. .set_rng(3, &rng)
  70. .set_dtype(0, dtype::QuantizedS8{1.2f})
  71. .set_dtype(1, dtype::QuantizedS8{1.3f})
  72. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  73. .set_dtype(3, dtype::QuantizedS8{1.1f})
  74. .set_dtype(4, dtype::QuantizedS8{1.0f})
  75. .set_epsilon(1 + 1e-3)
  76. .set_max_avg_error(1e-1)
  77. .set_max_avg_biased_error(1e-1);
  78. param::ConvBias param;
  79. param.pad_h = param.pad_w = 1;
  80. param.stride_h = param.stride_w = 1;
  81. param.format = param::ConvBias::Format::NCHW4;
  82. checker.set_param(param).execs({{32, 4, 12, 12, 4},
  83. {16, 4, 3, 3, 4},
  84. {1, 4, 1, 1, 4},
  85. {32, 4, 12, 12, 4},
  86. {}});
  87. }
  88. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_STRIDE2_WITH_Z) {
  89. require_compute_capability(6, 1);
  90. Checker<ConvBiasForward> checker(handle_cuda());
  91. checker.set_before_exec_callback(
  92. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  93. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  94. UniformIntRNG rng{-3, 3};
  95. UniformIntRNG bias_rng{-50, 50};
  96. checker.set_rng(0, &rng)
  97. .set_rng(1, &rng)
  98. .set_rng(2, &bias_rng)
  99. .set_rng(3, &rng)
  100. .set_dtype(0, dtype::QuantizedS8{1.2f})
  101. .set_dtype(1, dtype::QuantizedS8{1.3f})
  102. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  103. .set_dtype(3, dtype::QuantizedS8{1.1f})
  104. .set_dtype(4, dtype::QuantizedS8{1.0f})
  105. .set_epsilon(1 + 1e-3)
  106. .set_max_avg_error(1e-1)
  107. .set_max_avg_biased_error(1e-1);
  108. param::ConvBias param;
  109. param.pad_h = param.pad_w = 1;
  110. param.stride_h = param.stride_w = 2;
  111. param.format = param::ConvBias::Format::NCHW4;
  112. checker.set_param(param).execs({{32, 4, 12, 12, 4},
  113. {16, 4, 3, 3, 4},
  114. {1, 4, 1, 1, 4},
  115. {32, 4, 6, 6, 4},
  116. {}});
  117. }
  118. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_1x1) {
  119. require_compute_capability(6, 1);
  120. conv_bias::check_conv_bias(
  121. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  122. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  123. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  124. param::ConvBias::Format::NCHW4,
  125. conv_bias::get_int8_nchw4_args_check_bounds(1));
  126. }
  127. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_3x3) {
  128. require_compute_capability(6, 1);
  129. conv_bias::check_conv_bias(
  130. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  131. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  132. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  133. param::ConvBias::Format::NCHW4,
  134. conv_bias::get_int8_nchw4_args_check_bounds(3));
  135. }
  136. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_5x5) {
  137. require_compute_capability(6, 1);
  138. conv_bias::check_conv_bias(
  139. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  140. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  141. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  142. param::ConvBias::Format::NCHW4,
  143. conv_bias::get_int8_nchw4_args_check_bounds(5));
  144. }
  145. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_7x7) {
  146. require_compute_capability(6, 1);
  147. conv_bias::check_conv_bias(
  148. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  149. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  150. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  151. param::ConvBias::Format::NCHW4,
  152. conv_bias::get_int8_nchw4_args_check_bounds(7));
  153. }
  154. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4) {
  155. require_compute_capability(6, 1);
  156. conv_bias::check_conv_bias(
  157. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  158. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  159. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  160. param::ConvBias::Format::CHWN4);
  161. }
  162. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_WITH_Z) {
  163. require_compute_capability(6, 1);
  164. Checker<ConvBiasForward> checker(handle_cuda());
  165. checker.set_before_exec_callback(
  166. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  167. "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
  168. UniformIntRNG rng{-3, 3};
  169. UniformIntRNG bias_rng{-50, 50};
  170. checker.set_rng(0, &rng)
  171. .set_rng(1, &rng)
  172. .set_rng(2, &bias_rng)
  173. .set_rng(3, &rng)
  174. .set_dtype(0, dtype::QuantizedS8{1.2f})
  175. .set_dtype(1, dtype::QuantizedS8{1.3f})
  176. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  177. .set_dtype(3, dtype::QuantizedS8{1.1f})
  178. .set_dtype(4, dtype::QuantizedS8{1.1f})
  179. .set_epsilon(1 + 1e-3)
  180. .set_max_avg_error(1e-1)
  181. .set_max_avg_biased_error(1e-1);
  182. param::ConvBias param;
  183. param.pad_h = param.pad_w = 1;
  184. param.stride_h = param.stride_w = 1;
  185. param.format = param::ConvBias::Format::CHWN4;
  186. checker.set_param(param).execs({{4, 12, 12, 32, 4},
  187. {4, 3, 3, 16, 4},
  188. {4, 1, 1, 1, 4},
  189. {4, 12, 12, 32, 4},
  190. {}});
  191. }
  192. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
  193. require_compute_capability(6, 1);
  194. Checker<ConvBiasForward> checker(handle_cuda());
  195. checker.set_before_exec_callback(
  196. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  197. "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
  198. UniformIntRNG rng{-3, 3};
  199. UniformIntRNG bias_rng{-50, 50};
  200. checker.set_rng(0, &rng)
  201. .set_rng(1, &rng)
  202. .set_rng(2, &bias_rng)
  203. .set_rng(3, &rng)
  204. .set_dtype(0, dtype::QuantizedS8{1.2f})
  205. .set_dtype(1, dtype::QuantizedS8{1.3f})
  206. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  207. .set_dtype(4, dtype::QuantizedS8{0.001f})
  208. .set_epsilon(1 + 1e-3)
  209. .set_max_avg_error(1e-1)
  210. .set_max_avg_biased_error(1e-1);
  211. param::ConvBias param;
  212. param.pad_h = param.pad_w = 1;
  213. param.stride_h = param.stride_w = 1;
  214. param.format = param::ConvBias::Format::CHWN4;
  215. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  216. checker.set_param(param).execs(
  217. {{4, 12, 12, 32, 4}, {4, 3, 3, 16, 4}, {4, 1, 1, 1, 4}, {}, {}});
  218. }
  219. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
  220. require_compute_capability(6, 1);
  221. conv_bias::check_conv_bias(
  222. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  223. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  224. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  225. param::ConvBias::Format::CHWN4,
  226. conv_bias::get_int8_chwn4_args_check_bounds(3));
  227. }
  228. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1) {
  229. require_compute_capability(6, 1);
  230. conv_bias::check_conv_bias(
  231. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  232. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  233. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  234. param::ConvBias::Format::CHWN4,
  235. conv_bias::get_int8_chwn4_small_channel_args(1));
  236. }
  237. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_3x3) {
  238. require_compute_capability(6, 1);
  239. conv_bias::check_conv_bias(
  240. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  241. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  242. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  243. param::ConvBias::Format::CHWN4,
  244. conv_bias::get_int8_chwn4_small_channel_args(3));
  245. }
  246. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5) {
  247. require_compute_capability(6, 1);
  248. conv_bias::check_conv_bias(
  249. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  250. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  251. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  252. param::ConvBias::Format::CHWN4,
  253. conv_bias::get_int8_chwn4_small_channel_args(5));
  254. }
  255. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7) {
  256. require_compute_capability(6, 1);
  257. conv_bias::check_conv_bias(
  258. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  259. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  260. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  261. param::ConvBias::Format::CHWN4,
  262. conv_bias::get_int8_chwn4_small_channel_args(7));
  263. }
  264. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_SMALL_CHANNEL_CHECK_BOUNDS) {
  265. require_compute_capability(6, 1);
  266. conv_bias::check_conv_bias(
  267. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  268. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  269. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  270. param::ConvBias::Format::NCHW4,
  271. conv_bias::get_int8_nchw4_small_channel_args_check_bounds(3));
  272. }
  273. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1_CHECK_BOUNDS) {
  274. require_compute_capability(6, 1);
  275. conv_bias::check_conv_bias(
  276. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  277. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  278. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  279. param::ConvBias::Format::CHWN4,
  280. conv_bias::get_int8_chwn4_small_channel_args_check_bounds(1));
  281. }
  282. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5_CHECK_BOUNDS) {
  283. require_compute_capability(6, 1);
  284. conv_bias::check_conv_bias(
  285. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  286. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  287. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  288. param::ConvBias::Format::CHWN4,
  289. conv_bias::get_int8_chwn4_small_channel_args_check_bounds(5));
  290. }
  291. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7_CHECK_BOUNDS) {
  292. require_compute_capability(6, 1);
  293. conv_bias::check_conv_bias(
  294. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  295. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  296. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  297. param::ConvBias::Format::CHWN4,
  298. conv_bias::get_int8_chwn4_small_channel_args_check_bounds(7));
  299. }
  300. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_1x1) {
  301. require_compute_capability(7, 5);
  302. conv_bias::check_conv_bias(
  303. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  304. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  305. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  306. param::ConvBias::Format::NCHW4,
  307. conv_bias::get_int8_nchw4_tensorcore_args(1));
  308. }
  309. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_3x3) {
  310. require_compute_capability(7, 5);
  311. conv_bias::check_conv_bias(
  312. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  313. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  314. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  315. param::ConvBias::Format::NCHW4,
  316. conv_bias::get_int8_nchw4_tensorcore_args(3));
  317. }
  318. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_5x5) {
  319. require_compute_capability(7, 5);
  320. conv_bias::check_conv_bias(
  321. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  322. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  323. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  324. param::ConvBias::Format::NCHW4,
  325. conv_bias::get_int8_nchw4_tensorcore_args(5));
  326. }
  327. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_7x7) {
  328. require_compute_capability(7, 5);
  329. conv_bias::check_conv_bias(
  330. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  331. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  332. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  333. param::ConvBias::Format::NCHW4,
  334. conv_bias::get_int8_nchw4_tensorcore_args(7));
  335. }
  336. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
  337. require_compute_capability(7, 5);
  338. conv_bias::check_conv_bias(
  339. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  340. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  341. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  342. param::ConvBias::Format::NCHW4,
  343. conv_bias::get_int8_nchw4_args_check_bounds(3));
  344. }
  345. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
  346. require_compute_capability(7, 5);
  347. conv_bias::check_conv_bias(
  348. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  349. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  350. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma8x32x16",
  351. param::ConvBias::Format::NCHW4,
  352. conv_bias::get_int8_nchw4_args_check_bounds(3));
  353. }
  354. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
  355. require_compute_capability(7, 5);
  356. conv_bias::check_conv_bias(
  357. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  358. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  359. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma32x8x16",
  360. param::ConvBias::Format::NCHW4,
  361. conv_bias::get_int8_nchw4_args_check_bounds(3));
  362. }
  363. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_0) {
  364. require_compute_capability(7, 5);
  365. conv_bias::check_conv_bias(
  366. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  367. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  368. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  369. param::ConvBias::Format::CHWN4,
  370. conv_bias::get_int8_chwn4_tensorcore_args(3));
  371. }
  372. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_1) {
  373. require_compute_capability(7, 5);
  374. conv_bias::check_conv_bias(
  375. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  376. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  377. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma32x8x16",
  378. param::ConvBias::Format::CHWN4,
  379. conv_bias::get_int8_chwn4_tensorcore_args(3));
  380. }
  381. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_2) {
  382. require_compute_capability(7, 5);
  383. conv_bias::check_conv_bias(
  384. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  385. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  386. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma8x32x16",
  387. param::ConvBias::Format::CHWN4,
  388. conv_bias::get_int8_chwn4_tensorcore_args(3));
  389. }
  390. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_1x1) {
  391. require_compute_capability(7, 5);
  392. conv_bias::check_conv_bias(
  393. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  394. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  395. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  396. param::ConvBias::Format::CHWN4,
  397. conv_bias::get_int8_chwn4_args_check_bounds(1));
  398. }
  399. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_5x5) {
  400. require_compute_capability(7, 5);
  401. conv_bias::check_conv_bias(
  402. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  403. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  404. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  405. param::ConvBias::Format::CHWN4,
  406. conv_bias::get_int8_chwn4_args_check_bounds(5));
  407. }
  408. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_7x7) {
  409. require_compute_capability(7, 5);
  410. conv_bias::check_conv_bias(
  411. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  412. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  413. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  414. param::ConvBias::Format::CHWN4,
  415. conv_bias::get_int8_chwn4_args_check_bounds(7));
  416. }
  417. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_WITH_Z) {
  418. require_compute_capability(7, 5);
  419. Checker<ConvBiasForward> checker(handle_cuda());
  420. checker.set_before_exec_callback(
  421. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  422. "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
  423. UniformIntRNG rng{-3, 3};
  424. UniformIntRNG bias_rng{-50, 50};
  425. checker.set_rng(0, &rng)
  426. .set_rng(1, &rng)
  427. .set_rng(2, &bias_rng)
  428. .set_rng(3, &rng)
  429. .set_dtype(0, dtype::QuantizedS8{1.2f})
  430. .set_dtype(1, dtype::QuantizedS8{1.3f})
  431. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  432. .set_dtype(3, dtype::QuantizedS8{1.1f})
  433. .set_dtype(4, dtype::QuantizedS8{1.0f})
  434. .set_epsilon(1 + 1e-3)
  435. .set_max_avg_error(1e-1)
  436. .set_max_avg_biased_error(1e-1);
  437. param::ConvBias param;
  438. param.pad_h = param.pad_w = 1;
  439. param.stride_h = param.stride_w = 1;
  440. param.format = param::ConvBias::Format::NCHW4;
  441. checker.set_param(param).execs({{64, 8, 12, 12, 4},
  442. {64, 8, 3, 3, 4},
  443. {1, 16, 1, 1, 4},
  444. {64, 16, 12, 12, 4},
  445. {}});
  446. }
  447. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_WITH_Z) {
  448. require_compute_capability(7, 5);
  449. Checker<ConvBiasForward> checker(handle_cuda());
  450. checker.set_before_exec_callback(
  451. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  452. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
  453. UniformIntRNG rng{-3, 3};
  454. UniformIntRNG bias_rng{-50, 50};
  455. checker.set_rng(0, &rng)
  456. .set_rng(1, &rng)
  457. .set_rng(2, &bias_rng)
  458. .set_rng(3, &rng)
  459. .set_dtype(0, dtype::QuantizedS8{1.2f})
  460. .set_dtype(1, dtype::QuantizedS8{1.3f})
  461. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  462. .set_dtype(3, dtype::QuantizedS8{1.1f})
  463. .set_dtype(4, dtype::QuantizedS8{1.0f})
  464. .set_epsilon(1 + 1e-3)
  465. .set_max_avg_error(1e-1)
  466. .set_max_avg_biased_error(1e-1);
  467. param::ConvBias param;
  468. param.pad_h = param.pad_w = 1;
  469. param.stride_h = param.stride_w = 1;
  470. param.format = param::ConvBias::Format::CHWN4;
  471. checker.set_param(param).execs({{8, 12, 12, 64, 4},
  472. {8, 3, 3, 64, 4},
  473. {16, 1, 1, 1, 4},
  474. {16, 12, 12, 64, 4},
  475. {}});
  476. }
  477. TEST_F(CUDA,
  478. CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
  479. require_compute_capability(7, 5);
  480. conv_bias::check_conv_bias(
  481. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  482. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  483. handle_cuda(),
  484. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
  485. param::ConvBias::Format::CHWN4,
  486. conv_bias::get_int8_chwn4_args_check_bounds(3));
  487. }
  488. TEST_F(CUDA,
  489. CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
  490. require_compute_capability(7, 5);
  491. conv_bias::check_conv_bias(
  492. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  493. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  494. handle_cuda(),
  495. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
  496. param::ConvBias::Format::CHWN4,
  497. conv_bias::get_int8_chwn4_args_check_bounds(3));
  498. }
  499. TEST_F(CUDA,
  500. CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
  501. require_compute_capability(7, 5);
  502. conv_bias::check_conv_bias(
  503. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  504. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  505. handle_cuda(),
  506. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
  507. param::ConvBias::Format::CHWN4,
  508. conv_bias::get_int8_chwn4_args_check_bounds(3));
  509. }
  510. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_0) {
  511. require_compute_capability(7, 5);
  512. conv_bias::check_conv_bias(
  513. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  514. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  515. handle_cuda(),
  516. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
  517. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  518. }
  519. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_1) {
  520. require_compute_capability(7, 5);
  521. conv_bias::check_conv_bias(
  522. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  523. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  524. handle_cuda(),
  525. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
  526. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  527. }
  528. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_2) {
  529. require_compute_capability(7, 5);
  530. conv_bias::check_conv_bias(
  531. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  532. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  533. handle_cuda(),
  534. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
  535. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  536. }
  537. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_0) {
  538. require_compute_capability(7, 5);
  539. conv_bias::check_conv_bias(
  540. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  541. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  542. handle_cuda(),
  543. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
  544. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  545. }
  546. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_1) {
  547. require_compute_capability(7, 5);
  548. conv_bias::check_conv_bias(
  549. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  550. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  551. handle_cuda(),
  552. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
  553. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  554. }
  555. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_2) {
  556. require_compute_capability(7, 5);
  557. conv_bias::check_conv_bias(
  558. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  559. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  560. handle_cuda(),
  561. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
  562. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  563. }
  564. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1) {
  565. require_compute_capability(7, 5);
  566. conv_bias::check_conv_bias(
  567. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  568. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  569. handle_cuda(),
  570. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
  571. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(1));
  572. }
  573. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5) {
  574. require_compute_capability(7, 5);
  575. conv_bias::check_conv_bias(
  576. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  577. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  578. handle_cuda(),
  579. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
  580. param::ConvBias::Format::CHWN4,
  581. conv_bias::get_int8_chwn4_args_small_batch(5));
  582. }
  583. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_7x7) {
  584. require_compute_capability(7, 5);
  585. conv_bias::check_conv_bias(
  586. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  587. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  588. handle_cuda(),
  589. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
  590. param::ConvBias::Format::CHWN4,
  591. conv_bias::get_int8_chwn4_args_small_batch(7));
  592. }
  593. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_1) {
  594. require_compute_capability(7, 5);
  595. conv_bias::check_conv_bias(
  596. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  597. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  598. handle_cuda(),
  599. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
  600. param::ConvBias::Format::CHWN4,
  601. conv_bias::get_int8_chwn4_args_small_batch(5));
  602. }
  603. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_2) {
  604. require_compute_capability(7, 5);
  605. conv_bias::check_conv_bias(
  606. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  607. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  608. handle_cuda(),
  609. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
  610. param::ConvBias::Format::CHWN4,
  611. conv_bias::get_int8_chwn4_args_small_batch(5));
  612. }
  613. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_1) {
  614. require_compute_capability(7, 5);
  615. conv_bias::check_conv_bias(
  616. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  617. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  618. handle_cuda(),
  619. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
  620. param::ConvBias::Format::CHWN4,
  621. conv_bias::get_int8_chwn4_args_small_batch(1));
  622. }
  623. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
  624. require_compute_capability(7, 5);
  625. conv_bias::check_conv_bias(
  626. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  627. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  628. handle_cuda(),
  629. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
  630. param::ConvBias::Format::CHWN4,
  631. conv_bias::get_int8_chwn4_args_small_batch(1));
  632. }
  633. TEST_F(CUDA, CUTLASS_INT8_WEIGHT_PREPROCESS) {
  634. require_compute_capability(6, 1);
  635. Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
  636. handle_cuda());
  637. auto check = [&checker](const std::string& algo) {
  638. checker.set_before_exec_callback(
  639. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
  640. UniformIntRNG rng{-16, 16};
  641. UniformIntRNG bias_rng{-50, 50};
  642. UniformIntRNG const_rng{1, 1};
  643. checker.set_rng(0, &rng)
  644. .set_rng(1, &rng)
  645. .set_rng(2, &bias_rng)
  646. .set_rng(3, &rng)
  647. .set_dtype(0, dtype::QuantizedS8{1.2f})
  648. .set_dtype(1, dtype::QuantizedS8{1.3f})
  649. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  650. .set_dtype(3, dtype::QuantizedS8{1.3f})
  651. .set_dtype(4, dtype::QuantizedS8{1.0f})
  652. .set_epsilon(1 + 1e-3)
  653. .set_max_avg_error(1e-1)
  654. .set_max_avg_biased_error(1e-3);
  655. param::ConvBias param;
  656. param.pad_h = param.pad_w = 1;
  657. param.stride_h = param.stride_w = 2;
  658. param.format = param::ConvBias::Format::NCHW4;
  659. checker.set_param(param).execs({{16, 4, 14, 14, 4},
  660. {16, 4, 3, 3, 4},
  661. {1, 4, 1, 1, 4},
  662. {},
  663. {}});
  664. };
  665. check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32X32_64X32X32");
  666. check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8");
  667. }
  668. #if CUDA_VERSION >= 10020
  669. /// \note: we only check several cases and block sizes in megdnn_test, the
  670. /// full testcases are written in cutlass repository
  671. TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
  672. require_compute_capability_eq(7, 5);
  673. Checker<ConvBiasForward> checker(handle_cuda());
  674. auto check = [&checker](const std::string& algo) {
  675. checker.set_before_exec_callback(
  676. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
  677. UniformIntRNG rng{-8, 8};
  678. UniformIntRNG bias_rng{-50, 50};
  679. UniformIntRNG const_rng{1, 1};
  680. // use scale that are all integers to avoid rouding error
  681. checker.set_rng(0, &rng)
  682. .set_rng(1, &rng)
  683. .set_rng(2, &bias_rng)
  684. .set_rng(3, &rng)
  685. .set_dtype(0, dtype::QuantizedS8{6.0f})
  686. .set_dtype(1, dtype::QuantizedS8{1.0f})
  687. .set_dtype(2, dtype::QuantizedS32{6.0f})
  688. .set_dtype(3, dtype::QuantizedS8{1.0f})
  689. .set_dtype(4, dtype::QuantizedS8{6.0f})
  690. .set_epsilon(1e-3);
  691. param::ConvBias param;
  692. param.pad_h = param.pad_w = 1;
  693. param.stride_h = param.stride_w = 1;
  694. param.format = param::ConvBias::Format::NCHW32;
  695. checker.set_param(param).execs({{16, 16, 7, 7, 32},
  696. {512, 16, 3, 3, 32},
  697. {1, 16, 1, 1, 32},
  698. {},
  699. {}});
  700. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  701. checker.set_param(param).execs({{16, 16, 7, 7, 32},
  702. {512, 16, 1, 1, 32},
  703. {1, 16, 1, 1, 32},
  704. {},
  705. {}});
  706. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  707. checker.set_param(param).execs({{16, 16, 7, 7, 32},
  708. {512, 16, 3, 3, 32},
  709. {1, 16, 1, 1, 32},
  710. {},
  711. {}});
  712. // use non integer scale
  713. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  714. checker.set_dtype(0, dtype::QuantizedS8{1.1f})
  715. .set_dtype(1, dtype::QuantizedS8{1.2f})
  716. .set_dtype(2, dtype::QuantizedS32{1.1f * 1.2f})
  717. .set_dtype(3, dtype::QuantizedS8{1.1f})
  718. .set_dtype(4, dtype::QuantizedS8{6.0f})
  719. .set_epsilon(1 + 1e-3)
  720. .set_max_avg_error(1e-1)
  721. .set_max_avg_biased_error(1e-1)
  722. .execs({{16, 16, 7, 7, 32},
  723. {512, 16, 3, 3, 32},
  724. {1, 16, 1, 1, 32},
  725. {16, 16, 7, 7, 32},
  726. {}});
  727. };
  728. std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
  729. "INT8_NCHW32_IMMA_IMPLICIT_GEMM_256X128X64_64X64X64",
  730. ConvBias::DirectParam{});
  731. check(algo);
  732. algo = ConvBias::algo_name<ConvBias::DirectParam>(
  733. "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X64X64_32X16X64",
  734. ConvBias::DirectParam{});
  735. check(algo);
  736. }
  737. #endif
  738. TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) {
  739. require_compute_capability(6, 1);
  740. using namespace conv_bias;
  741. Checker<ConvBiasForward> checker(handle_cuda());
  742. UniformIntRNG int_rng{-3, 3};
  743. UniformFloatRNG float_rng{-50, 50};
  744. ConvBias::Param param;
  745. param.format = ConvBias::Param::Format::NCHW4_NCHW;
  746. param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
  747. checker.set_before_exec_callback(
  748. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  749. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  750. checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
  751. .set_dtype(1, dtype::QuantizedS8(1.9980927f))
  752. .set_dtype(2, dtype::Float32())
  753. .set_dtype(3, dtype::Float32())
  754. .set_dtype(4, dtype::Float32())
  755. .set_rng(0, &int_rng)
  756. .set_rng(1, &int_rng)
  757. .set_rng(2, &float_rng)
  758. .set_rng(3, &float_rng)
  759. .set_param(param);
  760. auto opr = handle_cuda()->create_operator<ConvBias>();
  761. auto run = [&](const TensorShapeArray& shapes) {
  762. opr->param() = param;
  763. TensorLayout dst_layout;
  764. opr->deduce_layout({shapes[0], dtype::Float32()},
  765. {shapes[1], dtype::Float32()}, {}, {}, dst_layout);
  766. checker.execs({shapes[0], shapes[1], shapes[2], dst_layout, {}});
  767. };
  768. run({{16, 4, 23, 40, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}});
  769. run({{16, 4, 92, 160, 4}, {24, 4, 3, 3, 4}, {1, 24, 1, 1}});
  770. run({{16, 4, 92, 160, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}});
  771. run({{16, 4, 92, 160, 4}, {16, 4, 3, 3, 4}, {1, 16, 1, 1}});
  772. run({{16, 4, 92, 160, 4}, {8, 4, 3, 3, 4}, {1, 8, 1, 1}});
  773. run({{16, 4, 46, 80, 4}, {4, 4, 3, 3, 4}, {1, 4, 1, 1}});
  774. }
  775. TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW32) {
  776. require_compute_capability(6, 1);
  777. using namespace conv_bias;
  778. Checker<ConvBiasForward> checker(handle_cuda());
  779. UniformIntRNG int_rng{-3, 3};
  780. UniformIntRNG bias_rng{-50, 50};
  781. ConvBias::Param param;
  782. param.format = ConvBias::Param::Format::NCHW4_NCHW32;
  783. param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
  784. checker.set_before_exec_callback(
  785. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  786. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  787. checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
  788. .set_dtype(1, dtype::QuantizedS8(1.9980927f))
  789. .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f))
  790. .set_dtype(3, dtype::QuantizedS8(1.9980618f))
  791. .set_dtype(4, dtype::QuantizedS8(1.9980618f))
  792. .set_rng(0, &int_rng)
  793. .set_rng(1, &int_rng)
  794. .set_rng(2, &bias_rng)
  795. .set_rng(3, &int_rng)
  796. .set_param(param);
  797. auto run = [&](const TensorShapeArray& shapes) {
  798. checker.execs({shapes[0], shapes[1], shapes[2], {}, {}});
  799. };
  800. run({{16, 4, 23, 40, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}});
  801. run({{16, 4, 92, 160, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}});
  802. run({{16, 4, 46, 80, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}});
  803. }
  804. #if CUDA_VERSION >= 10020
  805. TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) {
  806. require_compute_capability(7, 5);
  807. using namespace conv_bias;
  808. Checker<ConvBiasForward> checker(handle_cuda());
  809. UniformIntRNG int_rng{-3, 3};
  810. UniformIntRNG bias_rng{-50, 50};
  811. ConvBias::Param param;
  812. param.format = ConvBias::Param::Format::NCHW32_NCHW4;
  813. param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
  814. checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<
  815. ConvBiasForward>(
  816. ConvBias::algo_name<ConvBias::DirectParam>(
  817. "INT8_NCHW32_IMMA_IMPLICIT_GEMM_256X128X64_64X64X64",
  818. ConvBias::DirectParam{})
  819. .c_str()));
  820. checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
  821. .set_dtype(1, dtype::QuantizedS8(1.9980927f))
  822. .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f))
  823. .set_dtype(3, dtype::QuantizedS8(1.9980618f))
  824. .set_dtype(4, dtype::QuantizedS8(1.9980618f))
  825. .set_rng(0, &int_rng)
  826. .set_rng(1, &int_rng)
  827. .set_rng(2, &bias_rng)
  828. .set_rng(3, &int_rng)
  829. .set_param(param);
  830. auto run = [&](const TensorShapeArray& shapes) {
  831. checker.execs({shapes[0], shapes[1], shapes[2], {}, {}});
  832. };
  833. run({{16, 2, 23, 40, 32}, {20, 2, 3, 3, 32}, {1, 5, 1, 1, 4}});
  834. run({{16, 1, 92, 160, 32}, {24, 1, 3, 3, 32}, {1, 6, 1, 1, 4}});
  835. run({{16, 2, 46, 80, 32}, {4, 2, 3, 3, 32}, {1, 1, 1, 1, 4}});
  836. }
  837. #endif
  838. #if MEGDNN_WITH_BENCHMARK
  839. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
  840. require_compute_capability(6, 1);
  841. benchmark_target_algo(
  842. handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
  843. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  844. dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  845. param::ConvBias::Format::CHWN4);
  846. }
  847. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4) {
  848. require_compute_capability(6, 1);
  849. benchmark_target_algo(
  850. handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
  851. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  852. dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  853. param::ConvBias::Format::NCHW4);
  854. }
  855. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE) {
  856. require_compute_capability(7, 5);
  857. benchmark_target_algo_with_cudnn_tsc(
  858. handle_cuda(), get_resnet50_bench_args(256),
  859. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  860. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
  861. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  862. param::ConvBias::Format::CHWN4);
  863. }
  864. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE_ALL_ALGO) {
  865. require_compute_capability(7, 5);
  866. benchmark_target_algo_with_cudnn_tsc(
  867. handle_cuda(), get_resnet50_bench_args(256),
  868. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  869. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, nullptr,
  870. param::ConvBias::Format::CHWN4);
  871. }
  872. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_DET_ALL_ALGO) {
  873. require_compute_capability(7, 5);
  874. benchmark_target_algo_with_cudnn_tsc(
  875. handle_cuda(), get_detection_bench_args(), dtype::QuantizedS8{1.2f},
  876. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  877. dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
  878. }
  879. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_TENSORCORE) {
  880. require_compute_capability(7, 5);
  881. benchmark_target_algo_with_cudnn_tsc(
  882. handle_cuda(), get_resnet50_bench_args(256),
  883. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  884. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
  885. "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  886. param::ConvBias::Format::NCHW4);
  887. }
  888. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
  889. require_compute_capability(6, 1);
  890. std::vector<BenchArgs> args;
  891. args.push_back(BenchArgs{64, 4, 224, 224, 64, 7, 2});
  892. benchmark_target_algo(
  893. handle_cuda(), args, dtype::QuantizedS8{1.2f},
  894. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  895. dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  896. param::ConvBias::Format::CHWN4);
  897. }
  898. #if CUDA_VERSION >= 10020
  899. TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
  900. require_compute_capability(7, 5);
  901. benchmark_target_algo_with_cudnn_tsc(
  902. handle_cuda(), get_resnet50_bench_args(256),
  903. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  904. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
  905. "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
  906. param::ConvBias::Format::NCHW32);
  907. }
  908. #endif
  909. TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
  910. require_compute_capability(6, 1);
  911. benchmark_target_algo(
  912. handle_cuda(), get_resnet50_bench_args(64),
  913. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  914. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
  915. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4);
  916. }
  917. TEST_F(CUDA, BENCHMARK_SASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) {
  918. require_compute_capability(6, 1);
  919. std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
  920. "SASS_INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32_64",
  921. ConvBias::DirectParam{});
  922. benchmark_target_algo(handle_cuda(), get_det_first_bench_args(16),
  923. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  924. dtype::QuantizedS32{1.2f * 1.3f},
  925. dtype::QuantizedS8{1.0f}, algo.c_str(),
  926. param::ConvBias::Format::NCHW4);
  927. }
  928. TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) {
  929. require_compute_capability(6, 1);
  930. benchmark_target_algo(
  931. handle_cuda(), get_det_first_bench_args(16),
  932. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  933. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
  934. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16", param::ConvBias::Format::NCHW4);
  935. }
  936. #endif
  937. }
  938. } // namespace test
  939. } // namespace megdnn
  940. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台