

/**
 * \file dnn/test/cuda/conv_bias_int8.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
#include "src/cuda/cudnn_with_check.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"
#include "test/common/tensor.h"
#include "test/common/workspace_wrapper.h"

#define V1(x) #x
#define V(x) V1(x)

namespace megdnn {
namespace test {
namespace {
#if MEGDNN_WITH_BENCHMARK
struct BenchArgs {
    size_t n, ci, hi, wi, co, f, s;
};
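// BenchArgs fields (as consumed below): n = batch size; ci, hi, wi = input
// channels / height / width; co = output channels; f = (square) filter size;
// s = stride. Padding is always f / 2.
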
std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
    args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
    args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
    args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
    return args;
}

std::vector<BenchArgs> get_detection_bench_args(size_t batch = 16) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
    args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
    return args;
}

void benchmark_target_algo(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype,
        const char* algo = nullptr,
        param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
                    "ALGO_IMPLICIT_PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING));
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper: convert a plain NCHW shape to the packed layout `format`
    auto get_tensor_shape = [](TensorShape shape, Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        }
        return ret;
    };
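    // Worked example of the mapping above: an NCHW shape {32, 16, 12, 12}
    // reshapes to {32, 4, 4, 12, 12}, then dimshuffles to {32, 4, 12, 12, 4}
    // for NCHW4 or {4, 12, 12, 32, 4} for CHWN4, i.e. groups of 4 channels
    // become the innermost dimension.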
    for (auto&& arg : args) {
        Param param;
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
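        // ho/wo follow the usual convolution output-size formula, which
        // infer_conv_shape is assumed to implement:
        //   out = (in + 2 * pad - filter) / stride + 1.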
        benchmarker.set_param(param);
        if (!algo) {
            benchmarker.proxy()->target_algo = nullptr;
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             {},
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            {},
                                            {}}) /
                         RUNS;
        }
        Format format_cudnn = Format::NCHW4;
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        auto time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         {},
                                         {}}) /
                RUNS;
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
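        // flo is the conv's arithmetic cost in tera-ops: every output element
        // costs ci * f * f multiply-accumulates, counted as 2 ops each. The
        // printf below divides it by the runtime in seconds to report TOPS.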
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             get_tensor_shape(z, format),
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            get_tensor_shape(z, format),
                                            {}}) /
                         RUNS;
        }
        time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         get_tensor_shape(z, format_cudnn),
                                         {}}) /
                RUNS;
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}

void benchmark_target_algo_with_cudnn_tsc(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype,
        const char* algo = nullptr,
        param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
            new OprProxy<ConvBiasForward>{true}};
    if (!algo) {
        benchmarker.set_proxy(proxy);
    }
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
                    "ALGO_IMPLICIT_PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING));
#undef CUDNN_VERSION_STRING
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper: convert a plain NCHW shape to the packed layout `format`
    auto get_tensor_shape = [](TensorShape shape, Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::NCHW32) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 32, 32, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        }
        return ret;
    };
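    // Same mapping as before, extended with NCHW32: e.g. {32, 64, 12, 12}
    // becomes {32, 2, 12, 12, 32}, packing groups of 32 channels innermost,
    // which is the alignment the tensor-core kernels require.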
    for (auto&& arg : args) {
        Param param;
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            benchmarker.proxy()->target_algo = nullptr;
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        // skip test cases that cannot use the nchw32 tensor-core kernels
        if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
            continue;
        // skip test cases that cannot use the nchw4/chwn4 tensor-core kernels
        if ((format == Format::CHWN4 || format == Format::NCHW4) &&
            (arg.ci % 16 != 0))
            continue;
        Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
                                      ? Format::NCHW32
                                      : Format::NCHW4;
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             {},
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            {},
                                            {}}) /
                         RUNS;
        }
        float time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         {},
                                         {}}) /
                RUNS;
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             get_tensor_shape(z, format),
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            get_tensor_shape(z, format),
                                            {}}) /
                         RUNS;
        }
        time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         get_tensor_shape(z, format_cudnn),
                                         {}}) /
                RUNS;
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}
#endif
}  // namespace

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs({{32, 4, 12, 12, 4},
                                    {16, 4, 3, 3, 4},
                                    {1, 4, 1, 1, 4},
                                    {32, 4, 12, 12, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_STRIDE2_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 2;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs({{32, 4, 12, 12, 4},
                                    {16, 4, 3, 3, 4},
                                    {1, 4, 1, 1, 4},
                                    {32, 4, 6, 6, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.1f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    checker.set_param(param).execs({{4, 12, 12, 32, 4},
                                    {4, 3, 3, 16, 4},
                                    {4, 1, 1, 1, 4},
                                    {4, 12, 12, 32, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(4, dtype::QuantizedS8{0.001f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
    checker.set_param(param).execs(
            {{4, 12, 12, 32, 4}, {4, 3, 3, 16, 4}, {4, 1, 1, 1, 4}, {}, {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_SMALL_CHANNEL_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_small_channel_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_3x3) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs({{64, 8, 12, 12, 4},
                                    {64, 8, 3, 3, 4},
                                    {1, 16, 1, 1, 4},
                                    {64, 16, 12, 12, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    checker.set_param(param).execs({{8, 12, 12, 64, 4},
                                    {8, 3, 3, 64, 4},
                                    {16, 1, 1, 1, 4},
                                    {16, 12, 12, 64, 4},
                                    {}});
}

TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
#if CUDA_VERSION >= 10020
/// \note we only check several cases and block sizes in the megdnn test; the
/// full test cases are maintained in the cutlass repository
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
    require_compute_capability_eq(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        UniformIntRNG const_rng{1, 1};
        // use scales that are all integers to avoid rounding errors
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{6.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS32{6.0f})
                .set_dtype(3, dtype::QuantizedS8{1.0f})
                .set_dtype(4, dtype::QuantizedS8{6.0f})
                .set_epsilon(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NCHW32;
        checker.set_param(param).execs({{16, 16, 7, 7, 32},
                                        {512, 16, 3, 3, 32},
                                        {1, 16, 1, 1, 32},
                                        {},
                                        {}});
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs({{16, 16, 7, 7, 32},
                                        {512, 16, 1, 1, 32},
                                        {1, 16, 1, 1, 32},
                                        {},
                                        {}});
        param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
        checker.set_param(param).execs({{16, 16, 7, 7, 32},
                                        {512, 16, 3, 3, 32},
                                        {1, 16, 1, 1, 32},
                                        {},
                                        {}});
        // use a non-integer scale
        param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
        checker.set_dtype(0, dtype::QuantizedS8{1.1f})
                .set_dtype(1, dtype::QuantizedS8{1.2f})
                .set_dtype(2, dtype::QuantizedS32{1.1f * 1.2f})
                .set_dtype(3, dtype::QuantizedS8{1.1f})
                .set_dtype(4, dtype::QuantizedS8{6.0f})
                .set_epsilon(1 + 1e-3)
                .set_max_avg_error(1e-1)
                .set_max_avg_biased_error(1e-1)
                .execs({{16, 16, 7, 7, 32},
                        {512, 16, 3, 3, 32},
                        {1, 16, 1, 1, 32},
                        {16, 16, 7, 7, 32},
                        {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NCHW32_IMMA_IMPLICIT_GEMM_256X128X64_64X64X64",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X64X64_32X16X64",
            ConvBias::DirectParam{});
    check(algo);
}
#endif
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, nullptr,
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_DET_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_detection_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
    require_compute_capability(6, 1);
    std::vector<BenchArgs> args;
    args.push_back(BenchArgs{64, 4, 224, 224, 64, 7, 2});
    benchmark_target_algo(
            handle_cuda(), args, dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW32);
}
#endif

TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(64),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4);
}
#endif
}  // namespace test
}  // namespace megdnn

#undef V1
#undef V

// vim: syntax=cpp.doxygen
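
The NCHW4/NCHW32/CHWN4 shapes used throughout these tests are channel-packed views of plain NCHW tensors. As a minimal, self-contained illustration (not part of the test file above; all names here are ours), the following sketch performs the NCHW-to-NCHW4 repacking that the get_tensor_shape helpers express via reshape + dimshuffle:

// Standalone sketch: pack an int8 NCHW tensor into NCHW4,
// (N, C, H, W) -> (N, C/4, H, W, 4), with groups of 4 channels innermost.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int8_t> pack_nchw_to_nchw4(const std::vector<int8_t>& src,
                                       size_t n, size_t c, size_t h, size_t w) {
    assert(c % 4 == 0 && src.size() == n * c * h * w);
    std::vector<int8_t> dst(src.size());
    for (size_t in = 0; in < n; ++in)
        for (size_t ic = 0; ic < c; ++ic)
            for (size_t ih = 0; ih < h; ++ih)
                for (size_t iw = 0; iw < w; ++iw) {
                    size_t src_idx = ((in * c + ic) * h + ih) * w + iw;
                    // channel ic maps to (outer = ic / 4, inner = ic % 4)
                    size_t dst_idx =
                            (((in * (c / 4) + ic / 4) * h + ih) * w + iw) * 4 +
                            ic % 4;
                    dst[dst_idx] = src[src_idx];
                }
    return dst;
}

int main() {
    // A 1x8x2x2 tensor packs into 1x2x2x2x4.
    std::vector<int8_t> src(1 * 8 * 2 * 2);
    for (size_t i = 0; i < src.size(); ++i)
        src[i] = static_cast<int8_t>(i);
    auto dst = pack_nchw_to_nchw4(src, 1, 8, 2, 2);
    // Element (n=0, c=5, h=1, w=0) lands at (0, 1, 1, 0, 1) in the packed view.
    assert(dst[(((0 * 2 + 1) * 2 + 1) * 2 + 0) * 4 + 1] ==
           src[((0 * 8 + 5) * 2 + 1) * 2 + 0]);
    return 0;
}

CHWN4 and NCHW32 follow the same idea: CHWN4 additionally moves the batch dimension next to the packed group, and NCHW32 packs 32 channels innermost instead of 4, matching the alignment the tensor-core kernels above require.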

The MegEngine installation package ships with the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine has a GPU device and its driver properly installed. If you would like to try deep-learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.