
conv_bias_int8.cpp

/**
 * \file dnn/test/cuda/conv_bias_int8.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
#include "src/cuda/cudnn_with_check.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

#define V1(x) #x
#define V(x) V1(x)
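// V1 stringizes its argument; V expands the macro first, so V(CUDNN_MAJOR)
// yields the cuDNN major version as a string literal (e.g. "7").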

namespace megdnn {
namespace test {
namespace {

#if MEGDNN_WITH_BENCHMARK
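// BenchArgs describes one convolution workload: n = batch size, ci/hi/wi =
// input channels/height/width, co = output channels, f = (square) filter
// size, s = stride.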
struct BenchArgs {
    size_t n, ci, hi, wi, co, f, s;
};

std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
    args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
    args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
    args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
    return args;
}

std::vector<BenchArgs> get_detection_bench_args(size_t batch = 16) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
    args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
    return args;
}

void benchmark_target_algo(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype,
        const char* algo = nullptr,
        param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
                    "ALGO_IMPLICIT_PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING));
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper function to change format
    auto get_tensor_shape = [](TensorShape shape,
                               Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        }
        return ret;
    };
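    // e.g. an NCHW shape {64, 256, 56, 56} becomes {64, 64, 56, 56, 4} in
    // NCHW4 (channels split into groups of 4, packed innermost) and
    // {64, 56, 56, 64, 4} in CHWN4 (batch moved next to the packed 4)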
    for (auto&& arg : args) {
        Param param;
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
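        // pad = f / 2 gives "same"-style padding, so the output spatial size
        // is roughly hi / s (and wi / s)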
        benchmarker.set_param(param);
        if (!algo) {
            benchmarker.proxy()->target_algo = nullptr;
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             {},
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            {},
                                            {}}) /
                         RUNS;
        }
        Format format_cudnn = Format::NCHW4;
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        auto time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         {},
                                         {}}) /
                RUNS;
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
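        // 2 * N * Co * Ho * Wo * Ci * F * F ops per conv, scaled to TeraOps;
        // dividing by the runtime in seconds below yields throughput in TOps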
  181. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  182. "time(cudnn)=%.2f %.2fTops, "
  183. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  184. src.to_string().c_str(), filter.to_string().c_str(),
  185. dst.to_string().c_str(), algo, time_in_ms,
  186. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  187. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  188. time_in_ms_cudnn / time_in_ms);
  189. printf("bench with z tensor\n");
  190. if (algo) {
  191. time_in_ms =
  192. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  193. CUTimer>(benchmarker,
  194. {get_tensor_shape(src, format),
  195. get_tensor_shape(filter, format),
  196. get_tensor_shape(bias, format),
  197. get_tensor_shape(z, format),
  198. {}},
  199. algo) /
  200. RUNS;
  201. } else {
  202. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  203. get_tensor_shape(filter, format),
  204. get_tensor_shape(bias, format),
  205. get_tensor_shape(z, format),
  206. {}}) /
  207. RUNS;
  208. }
  209. time_in_ms_cudnn =
  210. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  211. get_tensor_shape(filter, format_cudnn),
  212. get_tensor_shape(bias, format_cudnn),
  213. get_tensor_shape(z, format_cudnn),
  214. {}}) /
  215. RUNS;
  216. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  217. "time(cudnn)=%.2f %.2fTops, "
  218. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  219. src.to_string().c_str(), filter.to_string().c_str(),
  220. dst.to_string().c_str(), algo, time_in_ms,
  221. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  222. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  223. time_in_ms_cudnn / time_in_ms);
  224. }
  225. }
  226. void benchmark_target_algo_with_cudnn_tsc(
  227. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  228. DType filter_dtype, DType bias_dtype, DType dst_dtype,
  229. const char* algo = nullptr,
  230. param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
  231. megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
  232. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  233. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  234. size_t RUNS = 1000;
  235. benchmarker.set_display(false).set_times(RUNS);
  236. benchmarker_cudnn.set_display(false).set_times(RUNS);
  237. std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
  238. new OprProxy<ConvBiasForward>{true}};
  239. if (!algo) {
  240. benchmarker.set_proxy(proxy);
  241. }
  242. benchmarker_cudnn.set_before_exec_callback(
  243. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  244. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
  245. "ALGO_IMPLICIT_PRECOMP_"
  246. "GEMM" CUDNN_VERSION_STRING));
  247. #undef CUDNN_VERSION_STRING
  248. benchmarker.set_dtype(0, src_dtype)
  249. .set_dtype(1, filter_dtype)
  250. .set_dtype(2, bias_dtype)
  251. .set_dtype(3, dst_dtype)
  252. .set_dtype(4, dst_dtype);
  253. benchmarker_cudnn.set_dtype(0, src_dtype)
  254. .set_dtype(1, filter_dtype)
  255. .set_dtype(2, bias_dtype)
  256. .set_dtype(3, dst_dtype)
  257. .set_dtype(4, dst_dtype);
  258. using Param = ConvBias::Param;
  259. using Format = Param::Format;
  260. // helper function to change format
  261. auto get_tensor_shape = [](TensorShape shape,
  262. Format format) -> TensorShape {
  263. TensorShape ret;
  264. if (format == Format::NCHW4) {
  265. ret = static_cast<TensorShape>(
  266. TensorLayout{shape, dtype::Int8()}
  267. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  268. shape[3]})
  269. .dimshuffle({0, 1, 3, 4, 2}));
  270. } else if (format == Format::NCHW32) {
  271. ret = static_cast<TensorShape>(
  272. TensorLayout{shape, dtype::Int8()}
  273. .reshape({shape[0], shape[1] / 32, 32, shape[2],
  274. shape[3]})
  275. .dimshuffle({0, 1, 3, 4, 2}));
  276. } else if (format == Format::CHWN4) {
  277. ret = static_cast<TensorShape>(
  278. TensorLayout{shape, dtype::Int8()}
  279. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  280. shape[3]})
  281. .dimshuffle({1, 3, 4, 0, 2}));
  282. }
  283. return ret;
  284. };
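    // e.g. an NCHW shape {64, 256, 56, 56} becomes {64, 8, 56, 56, 32} in
    // NCHW32 (channels split into groups of 32, packed innermost)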
    for (auto&& arg : args) {
        Param param;
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            benchmarker.proxy()->target_algo = nullptr;
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        // skip test cases that cannot enable nchw32 tensorcore
        if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
            continue;
        // skip test cases that cannot enable nchw4/chwn4 tensorcore
        if ((format == Format::CHWN4 || format == Format::NCHW4) &&
            (arg.ci % 16 != 0))
            continue;
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             {},
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            {},
                                            {}}) /
                         RUNS;
        }
        Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
                                      ? Format::NCHW32
                                      : Format::NCHW4;
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        auto time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         {},
                                         {}}) /
                RUNS;
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             get_tensor_shape(z, format),
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            get_tensor_shape(z, format),
                                            {}}) /
                         RUNS;
        }
        time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         get_tensor_shape(z, format_cudnn),
                                         {}}) /
                RUNS;
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}
#endif
}  // namespace
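// The tests below each pin one algorithm by name (via check_conv_bias or
// ConvBiasAlgoChecker) and skip on GPUs below the required compute
// capability: 6.1 for the int8 dot-product kernels, 7.5 for the tensor-core
// (IMMA) kernels.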

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(7));
}
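// In the *_WITH_Z tests below, the fourth operand z is a residual input that
// is added to conv(src, filter) + bias before the activation, so its shape
// matches the output.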

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs({{32, 4, 12, 12, 4},
                                    {16, 4, 3, 3, 4},
                                    {1, 4, 1, 1, 4},
                                    {32, 4, 12, 12, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_STRIDE2_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 2;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs({{32, 4, 12, 12, 4},
                                    {16, 4, 3, 3, 4},
                                    {1, 4, 1, 1, 4},
                                    {32, 4, 6, 6, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.1f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    checker.set_param(param).execs({{4, 12, 12, 32, 4},
                                    {4, 3, 3, 16, 4},
                                    {4, 1, 1, 1, 4},
                                    {4, 12, 12, 32, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(4, dtype::QuantizedS8{0.001f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
    checker.set_param(param).execs({{4, 12, 12, 32, 4},
                                    {4, 3, 3, 16, 4},
                                    {4, 1, 1, 1, 4},
                                    {},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_SMALL_CHANNEL_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_small_channel_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_3x3) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs({{64, 8, 12, 12, 4},
                                    {64, 8, 3, 3, 4},
                                    {1, 16, 1, 1, 4},
                                    {64, 16, 12, 12, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    checker.set_param(param).execs({{8, 12, 12, 64, 4},
                                    {8, 3, 3, 64, 4},
                                    {16, 1, 1, 1, 4},
                                    {16, 12, 12, 64, 4},
                                    {}});
}

TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}

#if CUDA_VERSION >= 10020
/// \note: we only check several cases and block sizes in megdnn_test; the
/// full test cases live in the cutlass repository
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
    require_compute_capability_eq(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        UniformIntRNG const_rng{1, 1};
        // use scales that are all integers to avoid rounding errors
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{6.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS32{6.0f})
                .set_dtype(3, dtype::QuantizedS8{1.0f})
                .set_dtype(4, dtype::QuantizedS8{6.0f})
                .set_epsilon(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NCHW32;
        checker.set_param(param).execs({{16, 16, 7, 7, 32},
                                        {512, 16, 3, 3, 32},
                                        {1, 16, 1, 1, 32},
                                        {},
                                        {}});
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs({{16, 16, 7, 7, 32},
                                        {512, 16, 1, 1, 32},
                                        {1, 16, 1, 1, 32},
                                        {},
                                        {}});
        param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
        checker.set_param(param).execs({{16, 16, 7, 7, 32},
                                        {512, 16, 3, 3, 32},
                                        {1, 16, 1, 1, 32},
                                        {},
                                        {}});
        // use non-integer scales
        param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
        checker.set_dtype(0, dtype::QuantizedS8{1.1f})
                .set_dtype(1, dtype::QuantizedS8{1.2f})
                .set_dtype(2, dtype::QuantizedS32{1.1f * 1.2f})
                .set_dtype(3, dtype::QuantizedS8{1.1f})
                .set_dtype(4, dtype::QuantizedS8{6.0f})
                .set_epsilon(1 + 1e-3)
                .set_max_avg_error(1e-1)
                .set_max_avg_biased_error(1e-1)
                .execs({{16, 16, 7, 7, 32},
                        {512, 16, 3, 3, 32},
                        {1, 16, 1, 1, 32},
                        {16, 16, 7, 7, 32},
                        {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NCHW32_IMMA_IMPLICIT_GEMM_256X128X64_64X64X64",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X64X64_32X16X64",
            ConvBias::DirectParam{});
    check(algo);
}
#endif

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, nullptr,
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_DET_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_detection_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
    require_compute_capability(6, 1);
    std::vector<BenchArgs> args;
    args.push_back(BenchArgs{64, 4, 224, 224, 64, 7, 2});
    benchmark_target_algo(
            handle_cuda(), args, dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW32);
}
#endif

TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(64),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}
#endif

}  // namespace test
}  // namespace megdnn

#undef V1
#undef V

// vim: syntax=cpp.doxygen

The MegEngine package bundles the CUDA environment needed to run code on GPUs, so there is no separate CPU/GPU build. To run GPU programs, make sure the machine has a GPU and that the driver is properly installed. If you want to try deep-learning development on a cloud GPU platform, check out MegStudio.
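Before running the GPU tests above, it can help to confirm that a CUDA device and driver are actually visible. The following is a minimal standalone probe, not part of the test file, using only standard CUDA runtime API calls; the compute-capability numbers echo the require_compute_capability checks in the tests:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int n = 0;
    cudaError_t err = cudaGetDeviceCount(&n);
    if (err != cudaSuccess || n == 0) {
        std::printf("no usable CUDA device: %s\n", cudaGetErrorString(err));
        return 1;
    }
    for (int i = 0; i < n; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        // the dot-product kernels above need sm_61+, the tensor-core kernels sm_75
        std::printf("device %d: %s (sm_%d%d)\n", i, prop.name, prop.major,
                    prop.minor);
    }
    return 0;
}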