You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

conv_bias_int8.cpp 56 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275
  1. /**
  2. * \file dnn/test/cuda/conv_bias_int8.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/oprs/nn.h"
  13. #include "src/common/utils.h"
  14. #include "src/cuda/cudnn_with_check.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/conv_bias.h"
  17. #include "test/cuda/benchmark.h"
  18. #include "test/cuda/fixture.h"
  19. #include "test/cuda/utils.h"
  20. #include "test/common/tensor.h"
  21. #include "test/common/workspace_wrapper.h"
  22. #define V1(x) #x
  23. #define V(x) V1(x)
  24. namespace megdnn {
  25. namespace test {
  26. namespace {
  27. #if MEGDNN_WITH_BENCHMARK
  28. struct BenchArgs {
  29. size_t n, ci, hi, wi, co, f, s;
  30. };
  31. std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
  32. std::vector<BenchArgs> args;
  33. args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
  34. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
  35. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
  36. args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
  37. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
  38. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
  39. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
  40. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
  41. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
  42. args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
  43. args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
  44. args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
  45. args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
  46. args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
  47. args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
  48. args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
  49. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
  50. args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
  51. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
  52. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2});
  53. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
  54. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
  55. args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
  56. args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
  57. args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
  58. return args;
  59. }
  60. std::vector<BenchArgs> get_detection_bench_args(size_t batch = 16) {
  61. std::vector<BenchArgs> args;
  62. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
  63. args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
  64. args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
  65. args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
  66. args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
  67. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
  68. args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
  69. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
  70. args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
  71. args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
  72. args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
  73. args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
  74. args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
  75. args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
  76. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
  77. args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
  78. args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
  79. args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
  80. args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
  81. args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
  82. args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
  83. args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
  84. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
  85. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
  86. args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
  87. return args;
  88. }
  89. std::vector<BenchArgs> get_det_first_bench_args(size_t batch = 16) {
  90. std::vector<BenchArgs> args;
  91. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 16, 3, 2});
  92. args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
  93. return args;
  94. }
  95. void benchmark_target_algo(
  96. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  97. DType filter_dtype, DType bias_dtype, DType dst_dtype,
  98. const char* algo = nullptr,
  99. param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
  100. megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
  101. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  102. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  103. size_t RUNS = 1000;
  104. benchmarker.set_display(false).set_times(RUNS);
  105. benchmarker_cudnn.set_display(false).set_times(RUNS);
  106. #define CUDNN_VERSION_STRING \
  107. "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  108. benchmarker_cudnn.set_before_exec_callback(
  109. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  110. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
  111. "ALGO_IMPLICIT_PRECOMP_"
  112. "GEMM" CUDNN_VERSION_STRING));
  113. benchmarker.set_dtype(0, src_dtype)
  114. .set_dtype(1, filter_dtype)
  115. .set_dtype(2, bias_dtype)
  116. .set_dtype(3, dst_dtype)
  117. .set_dtype(4, dst_dtype);
  118. benchmarker_cudnn.set_dtype(0, src_dtype)
  119. .set_dtype(1, filter_dtype)
  120. .set_dtype(2, bias_dtype)
  121. .set_dtype(3, dst_dtype)
  122. .set_dtype(4, dst_dtype);
  123. using Param = ConvBias::Param;
  124. using Format = Param::Format;
  125. // helper function to change format
  126. auto get_tensor_shape = [](TensorShape shape,
  127. Format format) -> TensorShape {
  128. TensorShape ret;
  129. if (format == Format::NCHW4) {
  130. ret = static_cast<TensorShape>(
  131. TensorLayout{shape, dtype::Int8()}
  132. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  133. shape[3]})
  134. .dimshuffle({0, 1, 3, 4, 2}));
  135. } else if (format == Format::CHWN4) {
  136. ret = static_cast<TensorShape>(
  137. TensorLayout{shape, dtype::Int8()}
  138. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  139. shape[3]})
  140. .dimshuffle({1, 3, 4, 0, 2}));
  141. }
  142. return ret;
  143. };
  144. for (auto&& arg : args) {
  145. Param param;
  146. param.pad_h = param.pad_w = arg.f / 2;
  147. param.stride_h = param.stride_w = arg.s;
  148. param.format = format;
  149. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  150. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  151. benchmarker.set_param(param);
  152. if (!algo) {
  153. benchmarker.proxy()->target_algo = nullptr;
  154. }
  155. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  156. filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
  157. z{arg.n, arg.co, ho, wo}, dst = z;
  158. float time_in_ms = 0.f;
  159. if (algo) {
  160. time_in_ms =
  161. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  162. CUTimer>(benchmarker,
  163. {get_tensor_shape(src, format),
  164. get_tensor_shape(filter, format),
  165. get_tensor_shape(bias, format),
  166. {},
  167. {}},
  168. algo) /
  169. RUNS;
  170. } else {
  171. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  172. get_tensor_shape(filter, format),
  173. get_tensor_shape(bias, format),
  174. {},
  175. {}}) /
  176. RUNS;
  177. }
  178. Format format_cudnn = Format::NCHW4;
  179. param.format = format_cudnn;
  180. benchmarker_cudnn.set_param(param);
  181. auto time_in_ms_cudnn =
  182. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  183. get_tensor_shape(filter, format_cudnn),
  184. get_tensor_shape(bias, format_cudnn),
  185. {},
  186. {}}) /
  187. RUNS;
  188. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
  189. (1e12);
  190. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  191. "time(cudnn)=%.2f %.2fTops, "
  192. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  193. src.to_string().c_str(), filter.to_string().c_str(),
  194. dst.to_string().c_str(), algo, time_in_ms,
  195. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  196. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  197. time_in_ms_cudnn / time_in_ms);
  198. printf("bench with z tensor\n");
  199. if (algo) {
  200. time_in_ms =
  201. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  202. CUTimer>(benchmarker,
  203. {get_tensor_shape(src, format),
  204. get_tensor_shape(filter, format),
  205. get_tensor_shape(bias, format),
  206. get_tensor_shape(z, format),
  207. {}},
  208. algo) /
  209. RUNS;
  210. } else {
  211. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  212. get_tensor_shape(filter, format),
  213. get_tensor_shape(bias, format),
  214. get_tensor_shape(z, format),
  215. {}}) /
  216. RUNS;
  217. }
  218. time_in_ms_cudnn =
  219. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  220. get_tensor_shape(filter, format_cudnn),
  221. get_tensor_shape(bias, format_cudnn),
  222. get_tensor_shape(z, format_cudnn),
  223. {}}) /
  224. RUNS;
  225. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  226. "time(cudnn)=%.2f %.2fTops, "
  227. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  228. src.to_string().c_str(), filter.to_string().c_str(),
  229. dst.to_string().c_str(), algo, time_in_ms,
  230. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  231. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  232. time_in_ms_cudnn / time_in_ms);
  233. }
  234. }
  235. void benchmark_target_algo_with_cudnn_tsc(
  236. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  237. DType filter_dtype, DType bias_dtype, DType dst_dtype,
  238. const char* algo = nullptr,
  239. param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
  240. megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
  241. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  242. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  243. size_t RUNS = 1000;
  244. benchmarker.set_display(false).set_times(RUNS);
  245. benchmarker_cudnn.set_display(false).set_times(RUNS);
  246. std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
  247. new OprProxy<ConvBiasForward>{true}};
  248. if (!algo) {
  249. benchmarker.set_proxy(proxy);
  250. }
  251. benchmarker_cudnn.set_before_exec_callback(
  252. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  253. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
  254. "ALGO_IMPLICIT_PRECOMP_"
  255. "GEMM" CUDNN_VERSION_STRING));
  256. #undef CUDNN_VERSION_STRING
  257. benchmarker.set_dtype(0, src_dtype)
  258. .set_dtype(1, filter_dtype)
  259. .set_dtype(2, bias_dtype)
  260. .set_dtype(3, dst_dtype)
  261. .set_dtype(4, dst_dtype);
  262. benchmarker_cudnn.set_dtype(0, src_dtype)
  263. .set_dtype(1, filter_dtype)
  264. .set_dtype(2, bias_dtype)
  265. .set_dtype(3, dst_dtype)
  266. .set_dtype(4, dst_dtype);
  267. using Param = ConvBias::Param;
  268. using Format = Param::Format;
  269. // helper function to change format
  270. auto get_tensor_shape = [](TensorShape shape,
  271. Format format) -> TensorShape {
  272. TensorShape ret;
  273. if (format == Format::NCHW4) {
  274. ret = static_cast<TensorShape>(
  275. TensorLayout{shape, dtype::Int8()}
  276. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  277. shape[3]})
  278. .dimshuffle({0, 1, 3, 4, 2}));
  279. } else if (format == Format::NCHW32) {
  280. ret = static_cast<TensorShape>(
  281. TensorLayout{shape, dtype::Int8()}
  282. .reshape({shape[0], shape[1] / 32, 32, shape[2],
  283. shape[3]})
  284. .dimshuffle({0, 1, 3, 4, 2}));
  285. } else if (format == Format::CHWN4) {
  286. ret = static_cast<TensorShape>(
  287. TensorLayout{shape, dtype::Int8()}
  288. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  289. shape[3]})
  290. .dimshuffle({1, 3, 4, 0, 2}));
  291. }
  292. return ret;
  293. };
  294. for (auto&& arg : args) {
  295. Param param;
  296. param.pad_h = param.pad_w = arg.f / 2;
  297. param.stride_h = param.stride_w = arg.s;
  298. param.format = format;
  299. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  300. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  301. benchmarker.set_param(param);
  302. if (!algo) {
  303. benchmarker.proxy()->target_algo = nullptr;
  304. }
  305. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  306. filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
  307. z{arg.n, arg.co, ho, wo}, dst = z;
  308. // skip testcase which cannot enable nchw32 tensorcore
  309. if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
  310. continue;
  311. // skip testcase which cannot enable nchw4/chwn4 tensorcore
  312. if ((format == Format::CHWN4 || format == Format::NCHW4) &&
  313. (arg.ci % 16 != 0))
  314. continue;
  315. Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
  316. ? Format::NCHW32
  317. : Format::NCHW4;
  318. param.format = format_cudnn;
  319. benchmarker_cudnn.set_param(param);
  320. float time_in_ms = 0.f;
  321. if (algo) {
  322. time_in_ms =
  323. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  324. CUTimer>(benchmarker,
  325. {get_tensor_shape(src, format),
  326. get_tensor_shape(filter, format),
  327. get_tensor_shape(bias, format),
  328. {},
  329. {}},
  330. algo) /
  331. RUNS;
  332. } else {
  333. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  334. get_tensor_shape(filter, format),
  335. get_tensor_shape(bias, format),
  336. {},
  337. {}}) /
  338. RUNS;
  339. }
  340. float time_in_ms_cudnn =
  341. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  342. get_tensor_shape(filter, format_cudnn),
  343. get_tensor_shape(bias, format_cudnn),
  344. {},
  345. {}}) /
  346. RUNS;
  347. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
  348. (1e12);
  349. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  350. "time(cudnn)=%.2f %.2fTops, "
  351. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  352. src.to_string().c_str(), filter.to_string().c_str(),
  353. dst.to_string().c_str(), algo, time_in_ms,
  354. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  355. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  356. time_in_ms_cudnn / time_in_ms);
  357. printf("bench with z tensor\n");
  358. if (algo) {
  359. time_in_ms =
  360. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  361. CUTimer>(benchmarker,
  362. {get_tensor_shape(src, format),
  363. get_tensor_shape(filter, format),
  364. get_tensor_shape(bias, format),
  365. get_tensor_shape(z, format),
  366. {}},
  367. algo) /
  368. RUNS;
  369. } else {
  370. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  371. get_tensor_shape(filter, format),
  372. get_tensor_shape(bias, format),
  373. get_tensor_shape(z, format),
  374. {}}) /
  375. RUNS;
  376. }
  377. time_in_ms_cudnn =
  378. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  379. get_tensor_shape(filter, format_cudnn),
  380. get_tensor_shape(bias, format_cudnn),
  381. get_tensor_shape(z, format_cudnn),
  382. {}}) /
  383. RUNS;
  384. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  385. "time(cudnn)=%.2f %.2fTops, "
  386. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  387. src.to_string().c_str(), filter.to_string().c_str(),
  388. dst.to_string().c_str(), algo, time_in_ms,
  389. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  390. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  391. time_in_ms_cudnn / time_in_ms);
  392. }
  393. }
  394. #endif
  395. } // namespace
  396. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
  397. require_compute_capability(6, 1);
  398. conv_bias::check_conv_bias(
  399. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  400. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  401. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  402. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(1));
  403. }
  404. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_3x3) {
  405. require_compute_capability(6, 1);
  406. conv_bias::check_conv_bias(
  407. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  408. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  409. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  410. param::ConvBias::Format::NCHW4);
  411. }
  412. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_5x5) {
  413. require_compute_capability(6, 1);
  414. conv_bias::check_conv_bias(
  415. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  416. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  417. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  418. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(5));
  419. }
  420. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_7x7) {
  421. require_compute_capability(6, 1);
  422. conv_bias::check_conv_bias(
  423. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  424. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  425. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  426. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(7));
  427. }
  428. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_WITH_Z) {
  429. require_compute_capability(6, 1);
  430. Checker<ConvBiasForward> checker(handle_cuda());
  431. checker.set_before_exec_callback(
  432. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  433. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  434. UniformIntRNG rng{-3, 3};
  435. UniformIntRNG bias_rng{-50, 50};
  436. checker.set_rng(0, &rng)
  437. .set_rng(1, &rng)
  438. .set_rng(2, &bias_rng)
  439. .set_rng(3, &rng)
  440. .set_dtype(0, dtype::QuantizedS8{1.2f})
  441. .set_dtype(1, dtype::QuantizedS8{1.3f})
  442. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  443. .set_dtype(3, dtype::QuantizedS8{1.1f})
  444. .set_dtype(4, dtype::QuantizedS8{1.0f})
  445. .set_epsilon(1 + 1e-3)
  446. .set_max_avg_error(1e-1)
  447. .set_max_avg_biased_error(1e-1);
  448. param::ConvBias param;
  449. param.pad_h = param.pad_w = 1;
  450. param.stride_h = param.stride_w = 1;
  451. param.format = param::ConvBias::Format::NCHW4;
  452. checker.set_param(param).execs({{32, 4, 12, 12, 4},
  453. {16, 4, 3, 3, 4},
  454. {1, 4, 1, 1, 4},
  455. {32, 4, 12, 12, 4},
  456. {}});
  457. }
  458. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_STRIDE2_WITH_Z) {
  459. require_compute_capability(6, 1);
  460. Checker<ConvBiasForward> checker(handle_cuda());
  461. checker.set_before_exec_callback(
  462. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  463. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  464. UniformIntRNG rng{-3, 3};
  465. UniformIntRNG bias_rng{-50, 50};
  466. checker.set_rng(0, &rng)
  467. .set_rng(1, &rng)
  468. .set_rng(2, &bias_rng)
  469. .set_rng(3, &rng)
  470. .set_dtype(0, dtype::QuantizedS8{1.2f})
  471. .set_dtype(1, dtype::QuantizedS8{1.3f})
  472. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  473. .set_dtype(3, dtype::QuantizedS8{1.1f})
  474. .set_dtype(4, dtype::QuantizedS8{1.0f})
  475. .set_epsilon(1 + 1e-3)
  476. .set_max_avg_error(1e-1)
  477. .set_max_avg_biased_error(1e-1);
  478. param::ConvBias param;
  479. param.pad_h = param.pad_w = 1;
  480. param.stride_h = param.stride_w = 2;
  481. param.format = param::ConvBias::Format::NCHW4;
  482. checker.set_param(param).execs({{32, 4, 12, 12, 4},
  483. {16, 4, 3, 3, 4},
  484. {1, 4, 1, 1, 4},
  485. {32, 4, 6, 6, 4},
  486. {}});
  487. }
  488. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_1x1) {
  489. require_compute_capability(6, 1);
  490. conv_bias::check_conv_bias(
  491. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  492. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  493. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  494. param::ConvBias::Format::NCHW4,
  495. conv_bias::get_int8_nchw4_args_check_bounds(1));
  496. }
  497. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_3x3) {
  498. require_compute_capability(6, 1);
  499. conv_bias::check_conv_bias(
  500. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  501. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  502. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  503. param::ConvBias::Format::NCHW4,
  504. conv_bias::get_int8_nchw4_args_check_bounds(3));
  505. }
  506. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_5x5) {
  507. require_compute_capability(6, 1);
  508. conv_bias::check_conv_bias(
  509. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  510. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  511. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  512. param::ConvBias::Format::NCHW4,
  513. conv_bias::get_int8_nchw4_args_check_bounds(5));
  514. }
  515. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_7x7) {
  516. require_compute_capability(6, 1);
  517. conv_bias::check_conv_bias(
  518. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  519. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  520. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  521. param::ConvBias::Format::NCHW4,
  522. conv_bias::get_int8_nchw4_args_check_bounds(7));
  523. }
  524. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4) {
  525. require_compute_capability(6, 1);
  526. conv_bias::check_conv_bias(
  527. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  528. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  529. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  530. param::ConvBias::Format::CHWN4);
  531. }
  532. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_WITH_Z) {
  533. require_compute_capability(6, 1);
  534. Checker<ConvBiasForward> checker(handle_cuda());
  535. checker.set_before_exec_callback(
  536. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  537. "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
  538. UniformIntRNG rng{-3, 3};
  539. UniformIntRNG bias_rng{-50, 50};
  540. checker.set_rng(0, &rng)
  541. .set_rng(1, &rng)
  542. .set_rng(2, &bias_rng)
  543. .set_rng(3, &rng)
  544. .set_dtype(0, dtype::QuantizedS8{1.2f})
  545. .set_dtype(1, dtype::QuantizedS8{1.3f})
  546. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  547. .set_dtype(3, dtype::QuantizedS8{1.1f})
  548. .set_dtype(4, dtype::QuantizedS8{1.1f})
  549. .set_epsilon(1 + 1e-3)
  550. .set_max_avg_error(1e-1)
  551. .set_max_avg_biased_error(1e-1);
  552. param::ConvBias param;
  553. param.pad_h = param.pad_w = 1;
  554. param.stride_h = param.stride_w = 1;
  555. param.format = param::ConvBias::Format::CHWN4;
  556. checker.set_param(param).execs({{4, 12, 12, 32, 4},
  557. {4, 3, 3, 16, 4},
  558. {4, 1, 1, 1, 4},
  559. {4, 12, 12, 32, 4},
  560. {}});
  561. }
  562. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
  563. require_compute_capability(6, 1);
  564. Checker<ConvBiasForward> checker(handle_cuda());
  565. checker.set_before_exec_callback(
  566. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  567. "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
  568. UniformIntRNG rng{-3, 3};
  569. UniformIntRNG bias_rng{-50, 50};
  570. checker.set_rng(0, &rng)
  571. .set_rng(1, &rng)
  572. .set_rng(2, &bias_rng)
  573. .set_rng(3, &rng)
  574. .set_dtype(0, dtype::QuantizedS8{1.2f})
  575. .set_dtype(1, dtype::QuantizedS8{1.3f})
  576. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  577. .set_dtype(4, dtype::QuantizedS8{0.001f})
  578. .set_epsilon(1 + 1e-3)
  579. .set_max_avg_error(1e-1)
  580. .set_max_avg_biased_error(1e-1);
  581. param::ConvBias param;
  582. param.pad_h = param.pad_w = 1;
  583. param.stride_h = param.stride_w = 1;
  584. param.format = param::ConvBias::Format::CHWN4;
  585. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  586. checker.set_param(param).execs(
  587. {{4, 12, 12, 32, 4}, {4, 3, 3, 16, 4}, {4, 1, 1, 1, 4}, {}, {}});
  588. }
  589. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
  590. require_compute_capability(6, 1);
  591. conv_bias::check_conv_bias(
  592. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  593. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  594. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  595. param::ConvBias::Format::CHWN4,
  596. conv_bias::get_int8_chwn4_args_check_bounds(3));
  597. }
  598. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1) {
  599. require_compute_capability(6, 1);
  600. conv_bias::check_conv_bias(
  601. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  602. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  603. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  604. param::ConvBias::Format::CHWN4,
  605. conv_bias::get_int8_chwn4_small_channel_args(1));
  606. }
  607. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_3x3) {
  608. require_compute_capability(6, 1);
  609. conv_bias::check_conv_bias(
  610. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  611. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  612. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  613. param::ConvBias::Format::CHWN4,
  614. conv_bias::get_int8_chwn4_small_channel_args(3));
  615. }
  616. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5) {
  617. require_compute_capability(6, 1);
  618. conv_bias::check_conv_bias(
  619. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  620. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  621. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  622. param::ConvBias::Format::CHWN4,
  623. conv_bias::get_int8_chwn4_small_channel_args(5));
  624. }
  625. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7) {
  626. require_compute_capability(6, 1);
  627. conv_bias::check_conv_bias(
  628. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  629. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  630. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  631. param::ConvBias::Format::CHWN4,
  632. conv_bias::get_int8_chwn4_small_channel_args(7));
  633. }
  634. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_SMALL_CHANNEL_CHECK_BOUNDS) {
  635. require_compute_capability(6, 1);
  636. conv_bias::check_conv_bias(
  637. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  638. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  639. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  640. param::ConvBias::Format::NCHW4,
  641. conv_bias::get_int8_nchw4_small_channel_args_check_bounds(3));
  642. }
  643. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1_CHECK_BOUNDS) {
  644. require_compute_capability(6, 1);
  645. conv_bias::check_conv_bias(
  646. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  647. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  648. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  649. param::ConvBias::Format::CHWN4,
  650. conv_bias::get_int8_chwn4_small_channel_args_check_bounds(1));
  651. }
// Boundary-checking 5x5 small-channel cases for
// INT8_CHWN4_DOTPROD_IMPLICIT_GEMM (CHWN4); requires compute capability >= 6.1.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(5));
}
// Boundary-checking 7x7 small-channel cases for
// INT8_CHWN4_DOTPROD_IMPLICIT_GEMM (CHWN4); requires compute capability >= 6.1.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(7));
}
// 1x1 tensor-core cases for INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16
// (NCHW4 layout); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(1));
}
// 3x3 tensor-core cases for INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16
// (NCHW4 layout); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_3x3) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(3));
}
// 5x5 tensor-core cases for INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16
// (NCHW4 layout); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(5));
}
// 7x7 tensor-core cases for INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16
// (NCHW4 layout); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(7));
}
// Boundary-checking 3x3 cases, mma16x16x16 tile variant (NCHW4);
// requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
// Boundary-checking 3x3 cases, mma8x32x16 tile variant (NCHW4);
// requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
// Boundary-checking 3x3 cases, mma32x8x16 tile variant (NCHW4);
// requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
// 3x3 tensor-core cases, mma16x16x16 tile variant (CHWN4);
// requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
// 3x3 tensor-core cases, mma32x8x16 tile variant (CHWN4);
// requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
// 3x3 tensor-core cases, mma8x32x16 tile variant (CHWN4);
// requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
// Boundary-checking 1x1 tensor-core cases (CHWN4, mma16x16x16);
// requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(1));
}
// Boundary-checking 5x5 tensor-core cases (CHWN4, mma16x16x16);
// requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(5));
}
// Boundary-checking 7x7 tensor-core cases (CHWN4, mma16x16x16);
// requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(7));
}
  787. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_WITH_Z) {
  788. require_compute_capability(7, 5);
  789. Checker<ConvBiasForward> checker(handle_cuda());
  790. checker.set_before_exec_callback(
  791. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  792. "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
  793. UniformIntRNG rng{-3, 3};
  794. UniformIntRNG bias_rng{-50, 50};
  795. checker.set_rng(0, &rng)
  796. .set_rng(1, &rng)
  797. .set_rng(2, &bias_rng)
  798. .set_rng(3, &rng)
  799. .set_dtype(0, dtype::QuantizedS8{1.2f})
  800. .set_dtype(1, dtype::QuantizedS8{1.3f})
  801. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  802. .set_dtype(3, dtype::QuantizedS8{1.1f})
  803. .set_dtype(4, dtype::QuantizedS8{1.0f})
  804. .set_epsilon(1 + 1e-3)
  805. .set_max_avg_error(1e-1)
  806. .set_max_avg_biased_error(1e-1);
  807. param::ConvBias param;
  808. param.pad_h = param.pad_w = 1;
  809. param.stride_h = param.stride_w = 1;
  810. param.format = param::ConvBias::Format::NCHW4;
  811. checker.set_param(param).execs({{64, 8, 12, 12, 4},
  812. {64, 8, 3, 3, 4},
  813. {1, 16, 1, 1, 4},
  814. {64, 16, 12, 12, 4},
  815. {}});
  816. }
  817. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_WITH_Z) {
  818. require_compute_capability(7, 5);
  819. Checker<ConvBiasForward> checker(handle_cuda());
  820. checker.set_before_exec_callback(
  821. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  822. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
  823. UniformIntRNG rng{-3, 3};
  824. UniformIntRNG bias_rng{-50, 50};
  825. checker.set_rng(0, &rng)
  826. .set_rng(1, &rng)
  827. .set_rng(2, &bias_rng)
  828. .set_rng(3, &rng)
  829. .set_dtype(0, dtype::QuantizedS8{1.2f})
  830. .set_dtype(1, dtype::QuantizedS8{1.3f})
  831. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  832. .set_dtype(3, dtype::QuantizedS8{1.1f})
  833. .set_dtype(4, dtype::QuantizedS8{1.0f})
  834. .set_epsilon(1 + 1e-3)
  835. .set_max_avg_error(1e-1)
  836. .set_max_avg_biased_error(1e-1);
  837. param::ConvBias param;
  838. param.pad_h = param.pad_w = 1;
  839. param.stride_h = param.stride_w = 1;
  840. param.format = param::ConvBias::Format::CHWN4;
  841. checker.set_param(param).execs({{8, 12, 12, 64, 4},
  842. {8, 3, 3, 64, 4},
  843. {16, 1, 1, 1, 4},
  844. {16, 12, 12, 64, 4},
  845. {}});
  846. }
// Boundary-checking 3x3 cases for the reorder-filter IMMA kernel,
// mma16x16x16 tile variant (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
// Boundary-checking 3x3 cases for the reorder-filter IMMA kernel,
// mma8x32x16 tile variant (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
// Boundary-checking 3x3 cases for the reorder-filter IMMA kernel,
// mma32x8x16 tile variant (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
// 3x3 cases for the reorder-filter IMMA kernel, mma16x16x16 tile variant
// (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// 3x3 cases for the reorder-filter IMMA kernel, mma8x32x16 tile variant
// (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// 3x3 cases for the reorder-filter IMMA kernel, mma32x8x16 tile variant
// (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// 3x3 cases for the unroll-width IMMA kernel, mma16x16x16 tile variant
// (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// 3x3 cases for the unroll-width IMMA kernel, mma8x32x16 tile variant
// (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// 3x3 cases for the unroll-width IMMA kernel, mma32x8x16 tile variant
// (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// 1x1 cases for the unroll-width IMMA kernel, mma16x16x16 tile variant
// (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(1));
}
// 5x5 small-batch cases for the unroll-width IMMA kernel, mma16x16x16 tile
// variant (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
// 7x7 small-batch cases for the unroll-width IMMA kernel, mma16x16x16 tile
// variant (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(7));
}
// 5x5 small-batch cases for the unroll-width IMMA kernel, mma32x8x16 tile
// variant (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
// 5x5 small-batch cases for the unroll-width IMMA kernel, mma8x32x16 tile
// variant (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
// 1x1 small-batch cases for the unroll-width IMMA kernel, mma32x8x16 tile
// variant (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
// 1x1 small-batch cases for the unroll-width IMMA kernel, mma8x32x16 tile
// variant (CHWN4); requires compute capability >= 7.5.
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
  1003. #if CUDA_VERSION >= 10020
  1004. /// \note: we only check several cases and block sizes in megdnn_test, the
  1005. /// full testcases are written in cutlass repository
  1006. TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
  1007. require_compute_capability_eq(7, 5);
  1008. Checker<ConvBiasForward> checker(handle_cuda());
  1009. auto check = [&checker](const std::string& algo) {
  1010. checker.set_before_exec_callback(
  1011. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
  1012. UniformIntRNG rng{-8, 8};
  1013. UniformIntRNG bias_rng{-50, 50};
  1014. UniformIntRNG const_rng{1, 1};
  1015. // use scale that are all integers to avoid rouding error
  1016. checker.set_rng(0, &rng)
  1017. .set_rng(1, &rng)
  1018. .set_rng(2, &bias_rng)
  1019. .set_rng(3, &rng)
  1020. .set_dtype(0, dtype::QuantizedS8{6.0f})
  1021. .set_dtype(1, dtype::QuantizedS8{1.0f})
  1022. .set_dtype(2, dtype::QuantizedS32{6.0f})
  1023. .set_dtype(3, dtype::QuantizedS8{1.0f})
  1024. .set_dtype(4, dtype::QuantizedS8{6.0f})
  1025. .set_epsilon(1e-3);
  1026. param::ConvBias param;
  1027. param.pad_h = param.pad_w = 1;
  1028. param.stride_h = param.stride_w = 1;
  1029. param.format = param::ConvBias::Format::NCHW32;
  1030. checker.set_param(param).execs({{16, 16, 7, 7, 32},
  1031. {512, 16, 3, 3, 32},
  1032. {1, 16, 1, 1, 32},
  1033. {},
  1034. {}});
  1035. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1036. checker.set_param(param).execs({{16, 16, 7, 7, 32},
  1037. {512, 16, 1, 1, 32},
  1038. {1, 16, 1, 1, 32},
  1039. {},
  1040. {}});
  1041. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  1042. checker.set_param(param).execs({{16, 16, 7, 7, 32},
  1043. {512, 16, 3, 3, 32},
  1044. {1, 16, 1, 1, 32},
  1045. {},
  1046. {}});
  1047. // use non integer scale
  1048. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  1049. checker.set_dtype(0, dtype::QuantizedS8{1.1f})
  1050. .set_dtype(1, dtype::QuantizedS8{1.2f})
  1051. .set_dtype(2, dtype::QuantizedS32{1.1f * 1.2f})
  1052. .set_dtype(3, dtype::QuantizedS8{1.1f})
  1053. .set_dtype(4, dtype::QuantizedS8{6.0f})
  1054. .set_epsilon(1 + 1e-3)
  1055. .set_max_avg_error(1e-1)
  1056. .set_max_avg_biased_error(1e-1)
  1057. .execs({{16, 16, 7, 7, 32},
  1058. {512, 16, 3, 3, 32},
  1059. {1, 16, 1, 1, 32},
  1060. {16, 16, 7, 7, 32},
  1061. {}});
  1062. };
  1063. std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
  1064. "INT8_NCHW32_IMMA_IMPLICIT_GEMM_256X128X64_64X64X64",
  1065. ConvBias::DirectParam{});
  1066. check(algo);
  1067. algo = ConvBias::algo_name<ConvBias::DirectParam>(
  1068. "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X64X64_32X16X64",
  1069. ConvBias::DirectParam{});
  1070. check(algo);
  1071. }
  1072. #endif
  1073. #if MEGDNN_WITH_BENCHMARK
// Benchmarks INT8_CHWN4_DOTPROD_IMPLICIT_GEMM on ResNet-50 layer shapes;
// requires compute capability >= 6.1.
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}
// Benchmarks INT8_NCHW4_DOTPROD_IMPLICIT_GEMM on ResNet-50 layer shapes;
// requires compute capability >= 6.1.
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}
// Benchmarks the CHWN4 mma16x16x16 IMMA kernel (vs cuDNN tensor-core
// baseline) on ResNet-50 shapes, batch 256; requires compute capability >= 7.5.
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4);
}
// Same benchmark as above with no algorithm pinned (nullptr), letting the
// framework pick; requires compute capability >= 7.5.
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, nullptr,
            param::ConvBias::Format::CHWN4);
}
// Benchmarks detection-model shapes in CHWN4 with no algorithm pinned;
// requires compute capability >= 7.5.
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_DET_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_detection_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
}
// Benchmarks the NCHW4 mma16x16x16 IMMA kernel on ResNet-50 shapes,
// batch 256; requires compute capability >= 7.5.
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4);
}
// Benchmarks a single small-channel (C=4) 7x7 stride-2 first-layer shape for
// the CHWN4 dot-product kernel; requires compute capability >= 6.1.
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
    require_compute_capability(6, 1);
    std::vector<BenchArgs> args;
    args.push_back(BenchArgs{64, 4, 224, 224, 64, 7, 2});
    benchmark_target_algo(
            handle_cuda(), args, dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}
  1133. #if CUDA_VERSION >= 10020
// Benchmarks the CUTLASS NCHW32 IMMA kernel family on ResNet-50 shapes,
// batch 256; requires compute capability >= 7.5.
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW32);
}
  1143. #endif
// Benchmarks the CUTLASS NCHW4 dot-product kernel on ResNet-50 shapes,
// batch 64; requires compute capability >= 6.1.
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(64),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM", param::ConvBias::Format::NCHW4);
}
// Benchmarks the hand-written SASS NCHW4 kernel (128X32_64 tile) on
// detection first-layer shapes, batch 16; requires compute capability >= 6.1.
TEST_F(CUDA, BENCHMARK_SASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) {
    require_compute_capability(6, 1);
    // Full algorithm name is built via algo_name with DirectParam.
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "SASS_INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32_64",
            ConvBias::DirectParam{});
    benchmark_target_algo(handle_cuda(), get_det_first_bench_args(16),
                          dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
                          dtype::QuantizedS32{1.2f * 1.3f},
                          dtype::QuantizedS8{1.0f}, algo.c_str(),
                          param::ConvBias::Format::NCHW4);
}
// Benchmarks the CUTLASS NCHW4 dot-product kernel (…_16 variant) on
// detection first-layer shapes, batch 16; requires compute capability >= 6.1.
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4_DET_FIRST) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_det_first_bench_args(16),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16", param::ConvBias::Format::NCHW4);
}
  1171. #endif
  1172. } // namespace test
  1173. } // namespace megdnn
  1174. #undef V1
  1175. #undef V
  1176. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台