/**
 * \file dnn/test/cuda/conv_bias_int8.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
#include "src/cuda/cudnn_with_check.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/conv_test_utils.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"
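// This file holds correctness tests for the CUDA int8 ConvBias kernels across
// several tensor layouts (NCHW4, CHWN4, NCHW32, NHWC, plus the mixed
// NCHW4_NCHW, NCHW4_NCHW32 and NCHW32_NCHW4 formats). The dot-product
// implicit-GEMM algorithms require compute capability 6.1, the tensor-core
// (IMMA) algorithms 7.5; benchmarks are gated behind MEGDNN_WITH_BENCHMARK at
// the end of the file.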
namespace megdnn {
namespace test {
namespace conv {
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CUDNN_CONVOLUTION) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "DEFAULT:CUDNN:ConvBiasActivation:", param::ConvBias::Format::NCHW4);
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4);
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args(7));
}
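// The *_WITH_Z tests pass five shapes to execs(): {src, filter, bias, z, dst}.
// The fourth operand z matches the output shape and is fused in as a residual
// addend before the activation, which is why these cases spell out an explicit
// z shape instead of leaving it empty.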
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs(
            {{32, 4, 12, 12, 4},
             {16, 4, 3, 3, 4},
             {1, 4, 1, 1, 4},
             {32, 4, 12, 12, 4},
             {}});
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_STRIDE2_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 2;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs(
            {{32, 4, 12, 12, 4},
             {16, 4, 3, 3, 4},
             {1, 4, 1, 1, 4},
             {32, 4, 6, 6, 4},
             {}});
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(7));
}
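// CHWN4 shapes below read as (C/4, H, W, N, 4); e.g. the z-test problem
// {4, 12, 12, 32, 4} is the same N=32, C=16, 12x12 case that the NCHW4 z-test
// above runs as {32, 4, 12, 12, 4}.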
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.1f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    checker.set_param(param).execs(
            {{4, 12, 12, 32, 4},
             {4, 3, 3, 16, 4},
             {4, 1, 1, 1, 4},
             {4, 12, 12, 32, 4},
             {}});
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(4, dtype::QuantizedS8{0.001f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
    checker.set_param(param).execs(
            {{4, 12, 12, 32, 4}, {4, 3, 3, 16, 4}, {4, 1, 1, 1, 4}, {}, {}});
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(7));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_SMALL_CHANNEL_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_small_channel_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(7));
}
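// The tensor-core (IMMA) algorithms below encode their mma tile shape in the
// algorithm name (mma16x16x16, mma32x8x16, mma8x32x16) and require compute
// capability 7.5, whereas the dot-product kernels above only need 6.1.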
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_3x3) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(7));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma8x32x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma32x8x16", param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma32x8x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma8x32x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16", param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(7));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs(
            {{64, 8, 12, 12, 4},
             {64, 8, 3, 3, 4},
             {1, 16, 1, 1, 4},
             {64, 16, 12, 12, 4},
             {}});
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    checker.set_param(param).execs(
            {{8, 12, 12, 64, 4},
             {8, 3, 3, 64, 4},
             {16, 1, 1, 1, 4},
             {16, 12, 12, 64, 4},
             {}});
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(7));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f}, handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
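// The fallback algorithm accepts plain NCHW tensors and relays the actual
// computation to the NCHW4 kernel named as its sub-algorithm, hence the
// {algo, {sub_algo}} pair handed to ConvBiasAlgoChecker below.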
TEST_F(CUDA, FALLBACK_CONV_QS8) {
    require_compute_capability_eq(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string&& algo, const std::string&& sub_algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                        {algo.c_str(), {sub_algo.c_str()}}));
        UniformIntRNG rng{-3, 3};
        UniformIntRNG bias_rng{-50, 50};
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{1.2f})
                .set_dtype(1, dtype::QuantizedS8{1.3f})
                .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
                .set_dtype(3, dtype::QuantizedS8{19.990229f})
                .set_dtype(4, dtype::QuantizedS8{19.990228f})
                .set_epsilon(1e-3)
                .set_max_avg_error(1e-1)
                .set_max_avg_biased_error(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 2;
        param.format = param::ConvBias::Format::NCHW;
        checker.set_param(param).execs(
                {{16, 15, 14, 14}, {28, 15, 3, 3}, {1, 28, 1, 1}, {16, 28, 7, 7}, {}});
        checker.set_param(param).execs(
                {{16, 32, 14, 14}, {32, 32, 3, 3}, {1, 32, 1, 1}, {}, {}});
    };
    check("FALLBACK_CONV_NCHW_QS8", "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM");
}
TEST_F(CUDA, FALLBACK_CONV_QS8_F32) {
    require_compute_capability_eq(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string&& algo, const std::string&& sub_algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                        {algo.c_str(), {sub_algo.c_str()}}));
        UniformIntRNG rng{-3, 3};
        UniformFloatRNG bias_rng{-50.f, 50.f};
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{1.2f})
                .set_dtype(1, dtype::QuantizedS8{1.3f})
                .set_dtype(2, dtype::Float32{})
                .set_dtype(3, dtype::Float32{})
                .set_dtype(4, dtype::Float32{})
                .set_epsilon(1e-3)
                .set_max_avg_error(1e-1)
                .set_max_avg_biased_error(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 2;
        param.format = param::ConvBias::Format::NCHW;
        checker.set_param(param).execs(
                {{16, 15, 14, 14}, {28, 15, 3, 3}, {1, 28, 1, 1}, {16, 28, 7, 7}, {}});
        checker.set_param(param).execs(
                {{16, 32, 14, 14}, {32, 32, 3, 3}, {1, 32, 1, 1}, {}, {}});
    };
    check("FALLBACK_CONV_NCHW_QS8", "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM");
}
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_WEIGHT_PREPROCESS) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-16, 16};
        UniformIntRNG bias_rng{-50, 50};
        UniformIntRNG const_rng{1, 1};
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{1.2f})
                .set_dtype(1, dtype::QuantizedS8{1.3f})
                .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
                .set_dtype(3, dtype::QuantizedS8{1.3f})
                .set_dtype(4, dtype::QuantizedS8{1.0f})
                .set_epsilon(1 + 1e-3)
                .set_max_avg_error(1e-1)
                .set_max_avg_biased_error(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 2;
        param.format = param::ConvBias::Format::NCHW4;
        checker.set_param(param).execs(
                {{16, 4, 14, 14, 4}, {16, 4, 3, 3, 4}, {1, 4, 1, 1, 4}, {}, {}});
    };
    check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32X32_64X32X32");
    check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8");
}
#if CUDA_VERSION >= 10020
/// \note: we only check several cases and block sizes in megdnn_test; the
/// full test cases are maintained in the cutlass repository
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
    require_compute_capability_eq(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        UniformIntRNG const_rng{1, 1};
        // use scales that are all integers to avoid rounding error
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{6.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS32{6.0f})
                .set_dtype(3, dtype::QuantizedS8{1.0f})
                .set_dtype(4, dtype::QuantizedS8{6.0f})
                .set_epsilon(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NCHW32;
        checker.set_param(param).execs(
                {{16, 8, 7, 7, 32}, {256, 8, 3, 3, 32}, {1, 8, 1, 1, 32}, {}, {}});
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 8, 7, 7, 32}, {256, 8, 1, 1, 32}, {1, 8, 1, 1, 32}, {}, {}});
        param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
        checker.set_param(param).execs(
                {{16, 8, 7, 7, 32}, {256, 8, 3, 3, 32}, {1, 8, 1, 1, 32}, {}, {}});
        // use a non-integer scale
        param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
        checker.set_dtype(0, dtype::QuantizedS8{1.1f})
                .set_dtype(1, dtype::QuantizedS8{1.2f})
                .set_dtype(2, dtype::QuantizedS32{1.1f * 1.2f})
                .set_dtype(3, dtype::QuantizedS8{1.1f})
                .set_dtype(4, dtype::QuantizedS8{6.0f})
                .set_epsilon(1 + 1e-3)
                .set_max_avg_error(1e-1)
                .set_max_avg_biased_error(1e-1)
                .execs({{16, 8, 7, 7, 32},
                        {256, 8, 3, 3, 32},
                        {1, 8, 1, 1, 32},
                        {16, 8, 7, 7, 32},
                        {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X128X64_64X64X64_2",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NCHW32_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1",
            ConvBias::DirectParam{});
    check(algo);
}
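// The NHWC cases use fully channels-last shapes: filter (OC, FH, FW, IC) and
// bias {1, 1, 1, OC}. The UINT4 variant re-quantizes z/dst to Quantized4Asymm,
// while the FLOAT variant keeps bias, z and dst in fp32.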
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{1.2f})
                .set_dtype(1, dtype::QuantizedS8{1.3f})
                .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
                .set_dtype(3, dtype::QuantizedS8{19.990229f})
                .set_dtype(4, dtype::QuantizedS8{19.990228f})
                .set_epsilon(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NHWC;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
        param.pad_h = param.pad_w = 0;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
            ConvBias::DirectParam{});
    check(algo);
}
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_UINT4_WEIGHT_PREPROCESS) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        UniformIntRNG rng_u4{0, 15};
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng_u4)
                .set_dtype(0, dtype::QuantizedS8{0.2f})
                .set_dtype(1, dtype::QuantizedS8{0.3f})
                .set_dtype(2, dtype::QuantizedS32{0.2f * 0.3f})
                .set_dtype(3, dtype::Quantized4Asymm{0.5f, 8})
                .set_dtype(4, dtype::Quantized4Asymm{0.5f, 4})
                .set_epsilon(1 + 1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NHWC;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
        param.pad_h = param.pad_w = 0;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
            ConvBias::DirectParam{});
    check(algo);
}
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_FLOAT) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformFloatRNG float_rng{-50, 50};
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &float_rng)
                .set_rng(3, &float_rng)
                .set_dtype(0, dtype::QuantizedS8(1.9980618f))
                .set_dtype(1, dtype::QuantizedS8(1.9980927f))
                .set_dtype(2, dtype::Float32())
                .set_dtype(3, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NHWC;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
        param.pad_h = param.pad_w = 0;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
            ConvBias::DirectParam{});
    check(algo);
}
#endif
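// NCHW4_NCHW mixes formats: quantized NCHW4 input, float NCHW output. Since
// the output layout cannot be read off the quantized inputs alone, each run
// first deduces dst_layout via deduce_layout() and passes it to execs()
// explicitly.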
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) {
    require_compute_capability(6, 1);
    using namespace conv_bias;
    Checker<ConvBiasForward> checker(handle_cuda());
    UniformIntRNG int_rng{-3, 3};
    UniformFloatRNG float_rng{-50, 50};
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW4_NCHW;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &float_rng)
            .set_rng(3, &float_rng)
            .set_param(param);
    auto opr = handle_cuda()->create_operator<ConvBias>();
    auto run = [&](const TensorShapeArray& shapes) {
        opr->param() = param;
        TensorLayout dst_layout;
        opr->deduce_layout(
                {shapes[0], dtype::Float32()}, {shapes[1], dtype::Float32()}, {}, {},
                dst_layout);
        checker.execs({shapes[0], shapes[1], shapes[2], dst_layout, {}});
    };
    run({{16, 4, 23, 40, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}});
    run({{16, 4, 92, 160, 4}, {24, 4, 3, 3, 4}, {1, 24, 1, 1}});
    run({{16, 4, 92, 160, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}});
    run({{16, 4, 92, 160, 4}, {16, 4, 3, 3, 4}, {1, 16, 1, 1}});
    run({{16, 4, 92, 160, 4}, {8, 4, 3, 3, 4}, {1, 8, 1, 1}});
    run({{16, 4, 46, 80, 4}, {4, 4, 3, 3, 4}, {1, 4, 1, 1}});
}
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW32) {
    require_compute_capability(6, 1);
    using namespace conv_bias;
    Checker<ConvBiasForward> checker(handle_cuda());
    UniformIntRNG int_rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW4_NCHW32;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f))
            .set_dtype(3, dtype::QuantizedS8(1.9980618f))
            .set_dtype(4, dtype::QuantizedS8(1.9980618f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &int_rng)
            .set_param(param);
    auto run = [&](const TensorShapeArray& shapes) {
        checker.execs({shapes[0], shapes[1], shapes[2], {}, {}});
    };
    run({{16, 4, 23, 40, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}});
    run({{16, 4, 92, 160, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}});
    run({{16, 4, 46, 80, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}});
}
#if CUDA_VERSION >= 10020
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) {
    require_compute_capability(7, 5);
    using namespace conv_bias;
    Checker<ConvBiasForward> checker(handle_cuda());
    UniformIntRNG int_rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW32_NCHW4;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBias::algo_name<ConvBias::DirectParam>(
                    "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X128X32_32X64X32_1",
                    ConvBias::DirectParam{})
                    .c_str()));
    checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f))
            .set_dtype(3, dtype::QuantizedS8(1.9980618f))
            .set_dtype(4, dtype::QuantizedS8(1.9980618f))
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &int_rng)
            .set_param(param);
    auto run = [&](const TensorShapeArray& shapes) {
        checker.execs({shapes[0], shapes[1], shapes[2], {}, {}});
    };
    run({{16, 2, 23, 40, 32}, {20, 2, 3, 3, 32}, {1, 5, 1, 1, 4}});
    run({{16, 1, 92, 160, 32}, {24, 1, 3, 3, 32}, {1, 6, 1, 1, 4}});
    run({{16, 2, 46, 80, 32}, {4, 2, 3, 3, 32}, {1, 1, 1, 1, 4}});
}
#endif
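// Benchmarks only; compiled when MEGDNN_WITH_BENCHMARK is set. In the
// *_ALL_ALGO cases the algorithm name is nullptr which, judging from the test
// names, lets the benchmark sweep all applicable algorithms.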
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_DET_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_detection_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
    require_compute_capability(6, 1);
    std::vector<BenchArgs> args;
    args.push_back(BenchArgs{64, 4, 224, 224, 64, 7, 2});
    benchmark_target_algo(
            handle_cuda(), args, dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    using namespace conv_bias;
    UniformIntRNG int_rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW4_NCHW;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
    benchmarker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);
    auto run = [&](const TensorShapeArray& shapes) {
        auto time_in_ms =
                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) / RUNS;
        printf("src=%s, filter=%s, dst=%s, time=%.2f\n", shapes[0].to_string().c_str(),
               shapes[1].to_string().c_str(), shapes[2].to_string().c_str(),
               time_in_ms);
    };
    run({{16, 16, 224, 224, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 92, 160, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    run({{16, 16, 46, 80, 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
}
#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW32);
}
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NHWC) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_det_first_bench_args(16), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "DIRECT:INT8_NHWC_IMMA_IMPLICIT_GEMM",
            param::ConvBias::Format::NHWC);
}
#endif
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(64), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}
TEST_F(CUDA, BENCHMARK_SASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) {
    require_compute_capability(6, 1);
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "SASS_INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32_64", ConvBias::DirectParam{});
    benchmark_target_algo(
            handle_cuda(), get_det_first_bench_args(16), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, algo.c_str(), param::ConvBias::Format::NCHW4);
}
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_det_first_bench_args(16), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16",
            param::ConvBias::Format::NCHW4);
}
#endif
} // namespace conv
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen