
dnn/test/cuda/conv_bias_int8.cpp

/**
 * \file dnn/test/cuda/conv_bias_int8.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "megdnn/oprs/nn.h"
#include "src/common/utils.h"
#include "src/cuda/cudnn_with_check.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"

#define V1(x) #x
#define V(x) V1(x)

namespace megdnn {
namespace test {
namespace {

#if MEGDNN_WITH_BENCHMARK
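// Each benchmark case is described by: batch size (n), input channels (ci),
// input height/width (hi, wi), output channels (co), filter size (f) and
// stride (s).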
struct BenchArgs {
    size_t n, ci, hi, wi, co, f, s;
};

std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
    args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
    args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
    args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
    return args;
}

std::vector<BenchArgs> get_detection_bench_args(size_t batch = 16) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
    args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
    return args;
}
void benchmark_target_algo(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype,
        const char* algo = nullptr,
        param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
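// CUDNN_VERSION_STRING expands to the linked cuDNN version, e.g. "v7.6.5";
// it is appended to the cuDNN algorithm name checked below.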
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
                    "ALGO_IMPLICIT_PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING));
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper function to change format
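    // NCHW4: reshape {N, C, H, W} -> {N, C/4, 4, H, W}, then dimshuffle to
    // {N, C/4, H, W, 4}, packing every 4 channels into the innermost axis.
    // CHWN4: same packing, shuffled to {C/4, H, W, N, 4}.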
    auto get_tensor_shape = [](TensorShape shape,
                               Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        }
        return ret;
    };
    for (auto&& arg : args) {
        Param param;
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            benchmarker.proxy()->target_algo = nullptr;
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             {},
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            {},
                                            {}}) /
                         RUNS;
        }
        Format format_cudnn = Format::NCHW4;
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        auto time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         {},
                                         {}}) /
                RUNS;
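        // FLOP count of the convolution: 2 * N * Co * Ho * Wo * Ci * F * F
        // (one multiply plus one add per MAC), scaled to tera-operations.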
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             get_tensor_shape(z, format),
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            get_tensor_shape(z, format),
                                            {}}) /
                         RUNS;
        }
        time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         get_tensor_shape(z, format_cudnn),
                                         {}}) /
                RUNS;
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}
void benchmark_target_algo_with_cudnn_tsc(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype,
        const char* algo = nullptr,
        param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
            new OprProxy<ConvBiasForward>{true}};
    if (!algo) {
        benchmarker.set_proxy(proxy);
    }
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
                    "ALGO_IMPLICIT_PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING));
#undef CUDNN_VERSION_STRING
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper function to change format
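    // Same packing as above, plus NCHW32 for the Tensor Core path:
    // reshape {N, C, H, W} -> {N, C/32, 32, H, W} and shuffle to
    // {N, C/32, H, W, 32}.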
    auto get_tensor_shape = [](TensorShape shape,
                               Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::NCHW32) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 32, 32, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        }
        return ret;
    };
    for (auto&& arg : args) {
        Param param;
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            benchmarker.proxy()->target_algo = nullptr;
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        // skip testcases that cannot use the nchw32 tensorcore path
        if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
            continue;
        // skip testcases that cannot use the nchw4/chwn4 tensorcore path
        if ((format == Format::CHWN4 || format == Format::NCHW4) &&
            (arg.ci % 16 != 0))
            continue;
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             {},
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            {},
                                            {}}) /
                         RUNS;
        }
        Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
                                      ? Format::NCHW32
                                      : Format::NCHW4;
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        auto time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         {},
                                         {}}) /
                RUNS;
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             get_tensor_shape(z, format),
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            get_tensor_shape(z, format),
                                            {}}) /
                         RUNS;
        }
        time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         get_tensor_shape(z, format_cudnn),
                                         {}}) /
                RUNS;
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}
#endif
}  // namespace
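// The checks below each run a single named algorithm over generated
// testcases and compare its output against a reference implementation
// (see test/common/conv_bias.h for check_conv_bias and the args helpers).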
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
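            // an epsilon slightly above 1 presumably tolerates a one-step
            // rounding difference in the quantized output (an assumption
            // about the checker's error metric, not stated in this file)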
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs({{32, 4, 12, 12, 4},
                                    {16, 4, 3, 3, 4},
                                    {1, 4, 1, 1, 4},
                                    {32, 4, 12, 12, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_STRIDE2_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 2;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs({{32, 4, 12, 12, 4},
                                    {16, 4, 3, 3, 4},
                                    {1, 4, 1, 1, 4},
                                    {32, 4, 6, 6, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_WITH_Z) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.1f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    checker.set_param(param).execs({{4, 12, 12, 32, 4},
                                    {4, 3, 3, 16, 4},
                                    {4, 1, 1, 1, 4},
                                    {4, 12, 12, 32, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
    require_compute_capability(6, 1);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(4, dtype::QuantizedS8{0.001f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
    checker.set_param(param).execs(
            {{4, 12, 12, 32, 4}, {4, 3, 3, 16, 4}, {4, 1, 1, 1, 4}, {}, {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_3x3) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_SMALL_CHANNEL_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_small_channel_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_3x3) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::NCHW4;
    checker.set_param(param).execs({{64, 8, 12, 12, 4},
                                    {64, 8, 3, 3, 4},
                                    {1, 16, 1, 1, 4},
                                    {64, 16, 12, 12, 4},
                                    {}});
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_WITH_Z) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
    UniformIntRNG rng{-3, 3};
    UniformIntRNG bias_rng{-50, 50};
    checker.set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &bias_rng)
            .set_rng(3, &rng)
            .set_dtype(0, dtype::QuantizedS8{1.2f})
            .set_dtype(1, dtype::QuantizedS8{1.3f})
            .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
            .set_dtype(3, dtype::QuantizedS8{1.1f})
            .set_dtype(4, dtype::QuantizedS8{1.0f})
            .set_epsilon(1 + 1e-3)
            .set_max_avg_error(1e-1)
            .set_max_avg_biased_error(1e-1);
    param::ConvBias param;
    param.pad_h = param.pad_w = 1;
    param.stride_h = param.stride_w = 1;
    param.format = param::ConvBias::Format::CHWN4;
    checker.set_param(param).execs({{8, 12, 12, 64, 4},
                                    {8, 3, 3, 64, 4},
                                    {16, 1, 1, 1, 4},
                                    {16, 12, 12, 64, 4},
                                    {}});
}

TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(7));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}

TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
#if CUDA_VERSION >= 10020
/// \note: we only check several cases and block sizes in megdnn_test; the
/// full test cases live in the cutlass repository
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
    require_compute_capability_eq(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        UniformIntRNG const_rng{1, 1};
        // use scales that are all integers to avoid rounding errors
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{6.0f})
                .set_dtype(1, dtype::QuantizedS8{1.0f})
                .set_dtype(2, dtype::QuantizedS32{6.0f})
                .set_dtype(3, dtype::QuantizedS8{1.0f})
                .set_dtype(4, dtype::QuantizedS8{6.0f})
                .set_epsilon(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NCHW32;
        checker.set_param(param).execs({{16, 16, 7, 7, 32},
                                        {512, 16, 3, 3, 32},
                                        {1, 16, 1, 1, 32},
                                        {},
                                        {}});
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs({{16, 16, 7, 7, 32},
                                        {512, 16, 1, 1, 32},
                                        {1, 16, 1, 1, 32},
                                        {},
                                        {}});
        param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
        checker.set_param(param).execs({{16, 16, 7, 7, 32},
                                        {512, 16, 3, 3, 32},
                                        {1, 16, 1, 1, 32},
                                        {},
                                        {}});
        // use non-integer scales
        param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
        checker.set_dtype(0, dtype::QuantizedS8{1.1f})
                .set_dtype(1, dtype::QuantizedS8{1.2f})
                .set_dtype(2, dtype::QuantizedS32{1.1f * 1.2f})
                .set_dtype(3, dtype::QuantizedS8{1.1f})
                .set_dtype(4, dtype::QuantizedS8{6.0f})
                .set_epsilon(1 + 1e-3)
                .set_max_avg_error(1e-1)
                .set_max_avg_biased_error(1e-1)
                .execs({{16, 16, 7, 7, 32},
                        {512, 16, 3, 3, 32},
                        {1, 16, 1, 1, 32},
                        {16, 16, 7, 7, 32},
                        {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NCHW32_IMMA_IMPLICIT_GEMM_256X128X64_64X64X64",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X64X64_32X16X64",
            ConvBias::DirectParam{});
    check(algo);
}
#endif
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, nullptr,
            param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_DET_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_detection_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4);
}

TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
    require_compute_capability(6, 1);
    std::vector<BenchArgs> args;
    args.push_back(BenchArgs{64, 4, 224, 224, 64, 7, 2});
    benchmark_target_algo(
            handle_cuda(), args, dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}

#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW32);
}
#endif

TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(64),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4);
}
#endif

}  // namespace test
}  // namespace megdnn

#undef V1
#undef V

// vim: syntax=cpp.doxygen

The MegEngine package bundles the CUDA environment needed to run code on a GPU, so there are no separate CPU and GPU builds. To run GPU programs, make sure the machine has a GPU and the driver is installed. If you would like to try deep-learning development on cloud GPU compute, you are welcome to visit the MegStudio platform.
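As a quick sanity check that a driver and CUDA device are visible before running the GPU tests above, here is a minimal sketch using the CUDA runtime API (independent of MegEngine itself; compile with nvcc):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    // cudaGetDeviceCount fails (or reports 0) when no usable
    // device/driver combination is present.
    int n = 0;
    cudaError_t err = cudaGetDeviceCount(&n);
    if (err != cudaSuccess || n == 0) {
        std::printf("no usable CUDA device: %s\n", cudaGetErrorString(err));
        return 1;
    }
    std::printf("found %d CUDA device(s)\n", n);
    return 0;
}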