You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

conv_bias_int8.cpp 55 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239
  1. /**
  2. * \file dnn/test/cuda/conv_bias_int8.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "megdnn/oprs/nn.h"
  12. #include "src/common/utils.h"
  13. #include "src/cuda/cudnn_with_check.h"
  14. #include "test/common/checker.h"
  15. #include "test/common/conv_bias.h"
  16. #include "test/cuda/benchmark.h"
  17. #include "test/cuda/fixture.h"
  18. #include "test/cuda/utils.h"
  19. #define MEGDNN_WITH_BENCHMARK 1
  20. #define V1(x) #x
  21. #define V(x) V1(x)
  22. namespace megdnn {
  23. namespace test {
  24. namespace {
  25. #if MEGDNN_WITH_BENCHMARK
// One conv-bias benchmark case, in NCHW terms.
struct BenchArgs {
    // n: batch, ci: input channels, hi/wi: input height/width,
    // co: output channels, f: square filter size, s: stride.
    size_t n, ci, hi, wi, co, f, s;
};
  29. std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
  30. std::vector<BenchArgs> args;
  31. args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
  32. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
  33. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
  34. args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
  35. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
  36. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
  37. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
  38. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
  39. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
  40. args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
  41. args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
  42. args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
  43. args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
  44. args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
  45. args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
  46. args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
  47. args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
  48. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
  49. args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
  50. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
  51. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
  52. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
  53. args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
  54. args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
  55. args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
  56. return args;
  57. }
  58. std::vector<BenchArgs> get_detection_bench_args(size_t batch = 16) {
  59. std::vector<BenchArgs> args;
  60. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
  61. args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
  62. args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
  63. args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
  64. args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
  65. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
  66. args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
  67. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
  68. args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
  69. args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
  70. args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
  71. args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
  72. args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
  73. args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
  74. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
  75. args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
  76. args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
  77. args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
  78. args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
  79. args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
  80. args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
  81. args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
  82. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
  83. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
  84. args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
  85. return args;
  86. }
  87. void benchmark_target_algo(
  88. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  89. DType filter_dtype, DType bias_dtype, DType dst_dtype,
  90. const char* algo = nullptr,
  91. param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
  92. megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
  93. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  94. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  95. size_t RUNS = 1000;
  96. benchmarker.set_display(false).set_times(RUNS);
  97. benchmarker_cudnn.set_display(false).set_times(RUNS);
  98. #define CUDNN_VERSION_STRING \
  99. "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  100. benchmarker_cudnn.set_before_exec_callback(
  101. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  102. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
  103. "ALGO_IMPLICIT_PRECOMP_"
  104. "GEMM" CUDNN_VERSION_STRING));
  105. benchmarker.set_dtype(0, src_dtype)
  106. .set_dtype(1, filter_dtype)
  107. .set_dtype(2, bias_dtype)
  108. .set_dtype(3, dst_dtype)
  109. .set_dtype(4, dst_dtype);
  110. benchmarker_cudnn.set_dtype(0, src_dtype)
  111. .set_dtype(1, filter_dtype)
  112. .set_dtype(2, bias_dtype)
  113. .set_dtype(3, dst_dtype)
  114. .set_dtype(4, dst_dtype);
  115. using Param = ConvBias::Param;
  116. using Format = Param::Format;
  117. // helper function to change format
  118. auto get_tensor_shape = [](TensorShape shape,
  119. Format format) -> TensorShape {
  120. TensorShape ret;
  121. if (format == Format::NCHW4) {
  122. ret = static_cast<TensorShape>(
  123. TensorLayout{shape, dtype::Int8()}
  124. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  125. shape[3]})
  126. .dimshuffle({0, 1, 3, 4, 2}));
  127. } else if (format == Format::CHWN4) {
  128. ret = static_cast<TensorShape>(
  129. TensorLayout{shape, dtype::Int8()}
  130. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  131. shape[3]})
  132. .dimshuffle({1, 3, 4, 0, 2}));
  133. }
  134. return ret;
  135. };
  136. for (auto&& arg : args) {
  137. Param param;
  138. param.pad_h = param.pad_w = arg.f / 2;
  139. param.stride_h = param.stride_w = arg.s;
  140. param.format = format;
  141. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  142. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  143. benchmarker.set_param(param);
  144. if (!algo) {
  145. benchmarker.proxy()->target_algo = nullptr;
  146. }
  147. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  148. filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
  149. z{arg.n, arg.co, ho, wo}, dst = z;
  150. float time_in_ms = 0.f;
  151. if (algo) {
  152. time_in_ms =
  153. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  154. CUTimer>(benchmarker,
  155. {get_tensor_shape(src, format),
  156. get_tensor_shape(filter, format),
  157. get_tensor_shape(bias, format),
  158. {},
  159. {}},
  160. algo) /
  161. RUNS;
  162. } else {
  163. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  164. get_tensor_shape(filter, format),
  165. get_tensor_shape(bias, format),
  166. {},
  167. {}}) /
  168. RUNS;
  169. }
  170. Format format_cudnn = Format::NCHW4;
  171. param.format = format_cudnn;
  172. benchmarker_cudnn.set_param(param);
  173. auto time_in_ms_cudnn =
  174. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  175. get_tensor_shape(filter, format_cudnn),
  176. get_tensor_shape(bias, format_cudnn),
  177. {},
  178. {}}) /
  179. RUNS;
  180. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
  181. (1e12);
  182. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  183. "time(cudnn)=%.2f %.2fTops, "
  184. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  185. src.to_string().c_str(), filter.to_string().c_str(),
  186. dst.to_string().c_str(), algo, time_in_ms,
  187. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  188. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  189. time_in_ms_cudnn / time_in_ms);
  190. printf("bench with z tensor\n");
  191. if (algo) {
  192. time_in_ms =
  193. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  194. CUTimer>(benchmarker,
  195. {get_tensor_shape(src, format),
  196. get_tensor_shape(filter, format),
  197. get_tensor_shape(bias, format),
  198. get_tensor_shape(z, format),
  199. {}},
  200. algo) /
  201. RUNS;
  202. } else {
  203. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  204. get_tensor_shape(filter, format),
  205. get_tensor_shape(bias, format),
  206. get_tensor_shape(z, format),
  207. {}}) /
  208. RUNS;
  209. }
  210. time_in_ms_cudnn =
  211. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  212. get_tensor_shape(filter, format_cudnn),
  213. get_tensor_shape(bias, format_cudnn),
  214. get_tensor_shape(z, format_cudnn),
  215. {}}) /
  216. RUNS;
  217. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  218. "time(cudnn)=%.2f %.2fTops, "
  219. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  220. src.to_string().c_str(), filter.to_string().c_str(),
  221. dst.to_string().c_str(), algo, time_in_ms,
  222. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  223. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  224. time_in_ms_cudnn / time_in_ms);
  225. }
  226. }
  227. void benchmark_target_algo_with_cudnn_tsc(
  228. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  229. DType filter_dtype, DType bias_dtype, DType dst_dtype,
  230. const char* algo = nullptr,
  231. param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
  232. megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
  233. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  234. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  235. size_t RUNS = 1000;
  236. benchmarker.set_display(false).set_times(RUNS);
  237. benchmarker_cudnn.set_display(false).set_times(RUNS);
  238. std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
  239. new OprProxy<ConvBiasForward>{true}};
  240. if (!algo) {
  241. benchmarker.set_proxy(proxy);
  242. }
  243. benchmarker_cudnn.set_before_exec_callback(
  244. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  245. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
  246. "ALGO_IMPLICIT_PRECOMP_"
  247. "GEMM" CUDNN_VERSION_STRING));
  248. #undef CUDNN_VERSION_STRING
  249. benchmarker.set_dtype(0, src_dtype)
  250. .set_dtype(1, filter_dtype)
  251. .set_dtype(2, bias_dtype)
  252. .set_dtype(3, dst_dtype)
  253. .set_dtype(4, dst_dtype);
  254. benchmarker_cudnn.set_dtype(0, src_dtype)
  255. .set_dtype(1, filter_dtype)
  256. .set_dtype(2, bias_dtype)
  257. .set_dtype(3, dst_dtype)
  258. .set_dtype(4, dst_dtype);
  259. using Param = ConvBias::Param;
  260. using Format = Param::Format;
  261. // helper function to change format
  262. auto get_tensor_shape = [](TensorShape shape,
  263. Format format) -> TensorShape {
  264. TensorShape ret;
  265. if (format == Format::NCHW4) {
  266. ret = static_cast<TensorShape>(
  267. TensorLayout{shape, dtype::Int8()}
  268. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  269. shape[3]})
  270. .dimshuffle({0, 1, 3, 4, 2}));
  271. } else if (format == Format::NCHW32) {
  272. ret = static_cast<TensorShape>(
  273. TensorLayout{shape, dtype::Int8()}
  274. .reshape({shape[0], shape[1] / 32, 32, shape[2],
  275. shape[3]})
  276. .dimshuffle({0, 1, 3, 4, 2}));
  277. } else if (format == Format::CHWN4) {
  278. ret = static_cast<TensorShape>(
  279. TensorLayout{shape, dtype::Int8()}
  280. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  281. shape[3]})
  282. .dimshuffle({1, 3, 4, 0, 2}));
  283. }
  284. return ret;
  285. };
  286. for (auto&& arg : args) {
  287. Param param;
  288. param.pad_h = param.pad_w = arg.f / 2;
  289. param.stride_h = param.stride_w = arg.s;
  290. param.format = format;
  291. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  292. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  293. benchmarker.set_param(param);
  294. if (!algo) {
  295. benchmarker.proxy()->target_algo = nullptr;
  296. }
  297. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  298. filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
  299. z{arg.n, arg.co, ho, wo}, dst = z;
  300. // skip testcase which cannot enable nchw32 tensorcore
  301. if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
  302. continue;
  303. // skip testcase which cannot enable nchw4/chwn4 tensorcore
  304. if ((format == Format::CHWN4 || format == Format::NCHW4) &&
  305. (arg.ci % 16 != 0))
  306. continue;
  307. float time_in_ms = 0.f;
  308. if (algo) {
  309. time_in_ms =
  310. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  311. CUTimer>(benchmarker,
  312. {get_tensor_shape(src, format),
  313. get_tensor_shape(filter, format),
  314. get_tensor_shape(bias, format),
  315. {},
  316. {}},
  317. algo) /
  318. RUNS;
  319. } else {
  320. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  321. get_tensor_shape(filter, format),
  322. get_tensor_shape(bias, format),
  323. {},
  324. {}}) /
  325. RUNS;
  326. }
  327. Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
  328. ? Format::NCHW32
  329. : Format::NCHW4;
  330. param.format = format_cudnn;
  331. benchmarker_cudnn.set_param(param);
  332. auto time_in_ms_cudnn =
  333. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  334. get_tensor_shape(filter, format_cudnn),
  335. get_tensor_shape(bias, format_cudnn),
  336. {},
  337. {}}) /
  338. RUNS;
  339. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
  340. (1e12);
  341. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  342. "time(cudnn)=%.2f %.2fTops, "
  343. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  344. src.to_string().c_str(), filter.to_string().c_str(),
  345. dst.to_string().c_str(), algo, time_in_ms,
  346. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  347. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  348. time_in_ms_cudnn / time_in_ms);
  349. printf("bench with z tensor\n");
  350. if (algo) {
  351. time_in_ms =
  352. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  353. CUTimer>(benchmarker,
  354. {get_tensor_shape(src, format),
  355. get_tensor_shape(filter, format),
  356. get_tensor_shape(bias, format),
  357. get_tensor_shape(z, format),
  358. {}},
  359. algo) /
  360. RUNS;
  361. } else {
  362. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  363. get_tensor_shape(filter, format),
  364. get_tensor_shape(bias, format),
  365. get_tensor_shape(z, format),
  366. {}}) /
  367. RUNS;
  368. }
  369. time_in_ms_cudnn =
  370. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  371. get_tensor_shape(filter, format_cudnn),
  372. get_tensor_shape(bias, format_cudnn),
  373. get_tensor_shape(z, format_cudnn),
  374. {}}) /
  375. RUNS;
  376. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  377. "time(cudnn)=%.2f %.2fTops, "
  378. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  379. src.to_string().c_str(), filter.to_string().c_str(),
  380. dst.to_string().c_str(), algo, time_in_ms,
  381. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  382. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  383. time_in_ms_cudnn / time_in_ms);
  384. }
  385. }
  386. #endif
  387. } // namespace
  388. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
  389. require_compute_capability(6, 1);
  390. conv_bias::check_conv_bias(
  391. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  392. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  393. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  394. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(1));
  395. }
  396. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_3x3) {
  397. require_compute_capability(6, 1);
  398. conv_bias::check_conv_bias(
  399. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  400. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  401. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  402. param::ConvBias::Format::NCHW4);
  403. }
  404. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_5x5) {
  405. require_compute_capability(6, 1);
  406. conv_bias::check_conv_bias(
  407. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  408. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  409. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  410. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(5));
  411. }
  412. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_7x7) {
  413. require_compute_capability(6, 1);
  414. conv_bias::check_conv_bias(
  415. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  416. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  417. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  418. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(7));
  419. }
  420. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_WITH_Z) {
  421. require_compute_capability(6, 1);
  422. Checker<ConvBiasForward> checker(handle_cuda());
  423. checker.set_before_exec_callback(
  424. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  425. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  426. UniformIntRNG rng{-3, 3};
  427. UniformIntRNG bias_rng{-50, 50};
  428. checker.set_rng(0, &rng)
  429. .set_rng(1, &rng)
  430. .set_rng(2, &bias_rng)
  431. .set_rng(3, &rng)
  432. .set_dtype(0, dtype::QuantizedS8{1.2f})
  433. .set_dtype(1, dtype::QuantizedS8{1.3f})
  434. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  435. .set_dtype(3, dtype::QuantizedS8{1.1f})
  436. .set_dtype(4, dtype::QuantizedS8{1.0f})
  437. .set_epsilon(1 + 1e-3)
  438. .set_max_avg_error(1e-1)
  439. .set_max_avg_biased_error(1e-1);
  440. param::ConvBias param;
  441. param.pad_h = param.pad_w = 1;
  442. param.stride_h = param.stride_w = 1;
  443. param.format = param::ConvBias::Format::NCHW4;
  444. checker.set_param(param).execs({{32, 4, 12, 12, 4},
  445. {16, 4, 3, 3, 4},
  446. {1, 4, 1, 1, 4},
  447. {32, 4, 12, 12, 4},
  448. {}});
  449. }
  450. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_STRIDE2_WITH_Z) {
  451. require_compute_capability(6, 1);
  452. Checker<ConvBiasForward> checker(handle_cuda());
  453. checker.set_before_exec_callback(
  454. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  455. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  456. UniformIntRNG rng{-3, 3};
  457. UniformIntRNG bias_rng{-50, 50};
  458. checker.set_rng(0, &rng)
  459. .set_rng(1, &rng)
  460. .set_rng(2, &bias_rng)
  461. .set_rng(3, &rng)
  462. .set_dtype(0, dtype::QuantizedS8{1.2f})
  463. .set_dtype(1, dtype::QuantizedS8{1.3f})
  464. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  465. .set_dtype(3, dtype::QuantizedS8{1.1f})
  466. .set_dtype(4, dtype::QuantizedS8{1.0f})
  467. .set_epsilon(1 + 1e-3)
  468. .set_max_avg_error(1e-1)
  469. .set_max_avg_biased_error(1e-1);
  470. param::ConvBias param;
  471. param.pad_h = param.pad_w = 1;
  472. param.stride_h = param.stride_w = 2;
  473. param.format = param::ConvBias::Format::NCHW4;
  474. checker.set_param(param).execs({{32, 4, 12, 12, 4},
  475. {16, 4, 3, 3, 4},
  476. {1, 4, 1, 1, 4},
  477. {32, 4, 6, 6, 4},
  478. {}});
  479. }
  480. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_1x1) {
  481. require_compute_capability(6, 1);
  482. conv_bias::check_conv_bias(
  483. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  484. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  485. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  486. param::ConvBias::Format::NCHW4,
  487. conv_bias::get_int8_nchw4_args_check_bounds(1));
  488. }
  489. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_3x3) {
  490. require_compute_capability(6, 1);
  491. conv_bias::check_conv_bias(
  492. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  493. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  494. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  495. param::ConvBias::Format::NCHW4,
  496. conv_bias::get_int8_nchw4_args_check_bounds(3));
  497. }
  498. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_5x5) {
  499. require_compute_capability(6, 1);
  500. conv_bias::check_conv_bias(
  501. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  502. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  503. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  504. param::ConvBias::Format::NCHW4,
  505. conv_bias::get_int8_nchw4_args_check_bounds(5));
  506. }
  507. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_7x7) {
  508. require_compute_capability(6, 1);
  509. conv_bias::check_conv_bias(
  510. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  511. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  512. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  513. param::ConvBias::Format::NCHW4,
  514. conv_bias::get_int8_nchw4_args_check_bounds(7));
  515. }
  516. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4) {
  517. require_compute_capability(6, 1);
  518. conv_bias::check_conv_bias(
  519. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  520. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  521. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  522. param::ConvBias::Format::CHWN4);
  523. }
  524. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_WITH_Z) {
  525. require_compute_capability(6, 1);
  526. Checker<ConvBiasForward> checker(handle_cuda());
  527. checker.set_before_exec_callback(
  528. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  529. "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
  530. UniformIntRNG rng{-3, 3};
  531. UniformIntRNG bias_rng{-50, 50};
  532. checker.set_rng(0, &rng)
  533. .set_rng(1, &rng)
  534. .set_rng(2, &bias_rng)
  535. .set_rng(3, &rng)
  536. .set_dtype(0, dtype::QuantizedS8{1.2f})
  537. .set_dtype(1, dtype::QuantizedS8{1.3f})
  538. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  539. .set_dtype(3, dtype::QuantizedS8{1.1f})
  540. .set_dtype(4, dtype::QuantizedS8{1.1f})
  541. .set_epsilon(1 + 1e-3)
  542. .set_max_avg_error(1e-1)
  543. .set_max_avg_biased_error(1e-1);
  544. param::ConvBias param;
  545. param.pad_h = param.pad_w = 1;
  546. param.stride_h = param.stride_w = 1;
  547. param.format = param::ConvBias::Format::CHWN4;
  548. checker.set_param(param).execs({{4, 12, 12, 32, 4},
  549. {4, 3, 3, 16, 4},
  550. {4, 1, 1, 1, 4},
  551. {4, 12, 12, 32, 4},
  552. {}});
  553. }
  554. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
  555. require_compute_capability(6, 1);
  556. Checker<ConvBiasForward> checker(handle_cuda());
  557. checker.set_before_exec_callback(
  558. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  559. "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
  560. UniformIntRNG rng{-3, 3};
  561. UniformIntRNG bias_rng{-50, 50};
  562. checker.set_rng(0, &rng)
  563. .set_rng(1, &rng)
  564. .set_rng(2, &bias_rng)
  565. .set_rng(3, &rng)
  566. .set_dtype(0, dtype::QuantizedS8{1.2f})
  567. .set_dtype(1, dtype::QuantizedS8{1.3f})
  568. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  569. .set_dtype(4, dtype::QuantizedS8{0.001f})
  570. .set_epsilon(1 + 1e-3)
  571. .set_max_avg_error(1e-1)
  572. .set_max_avg_biased_error(1e-1);
  573. param::ConvBias param;
  574. param.pad_h = param.pad_w = 1;
  575. param.stride_h = param.stride_w = 1;
  576. param.format = param::ConvBias::Format::CHWN4;
  577. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  578. checker.set_param(param).execs({{4, 12, 12, 32, 4},
  579. {4, 3, 3, 16, 4},
  580. {4, 1, 1, 1, 4},
  581. {},
  582. {}});
  583. }
  584. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
  585. require_compute_capability(6, 1);
  586. conv_bias::check_conv_bias(
  587. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  588. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  589. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  590. param::ConvBias::Format::CHWN4,
  591. conv_bias::get_int8_chwn4_args_check_bounds(3));
  592. }
  593. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1) {
  594. require_compute_capability(6, 1);
  595. conv_bias::check_conv_bias(
  596. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  597. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  598. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  599. param::ConvBias::Format::CHWN4,
  600. conv_bias::get_int8_chwn4_small_channel_args(1));
  601. }
  602. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_3x3) {
  603. require_compute_capability(6, 1);
  604. conv_bias::check_conv_bias(
  605. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  606. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  607. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  608. param::ConvBias::Format::CHWN4,
  609. conv_bias::get_int8_chwn4_small_channel_args(3));
  610. }
  611. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5) {
  612. require_compute_capability(6, 1);
  613. conv_bias::check_conv_bias(
  614. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  615. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  616. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  617. param::ConvBias::Format::CHWN4,
  618. conv_bias::get_int8_chwn4_small_channel_args(5));
  619. }
// ---------------------------------------------------------------------------
// Small-channel variants of the int8 dot-product ("DOTPROD") implicit-GEMM
// conv_bias algorithms, in CHWN4 and NCHW4 layouts; they require compute
// capability (6, 1). The dtype arguments are src scale 1.2f, filter scale
// 1.3f, bias scale 1.2f * 1.3f (product of src and filter scales), followed
// by the output dtype. The integer passed to each arg-generator is the
// filter size matching the test name; the *_check_bounds generators add
// shapes that exercise the bounds-checked code paths.
// ---------------------------------------------------------------------------
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args(7));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_SMALL_CHANNEL_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_small_channel_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7_CHECK_BOUNDS) {
    require_compute_capability(6, 1);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_small_channel_args_check_bounds(7));
}
// ---------------------------------------------------------------------------
// NCHW4 tensor-core ("IMMA", mma16x16x16 tile) implicit-GEMM tests over the
// common filter sizes 1/3/5/7; require compute capability (7, 5).
// ---------------------------------------------------------------------------
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_3x3) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_tensorcore_args(7));
}
// ---------------------------------------------------------------------------
// Bounds-checked shapes for the three NCHW4 tensor-core tile configurations:
// ALGO_0 = mma16x16x16, ALGO_1 = mma8x32x16, ALGO_2 = mma32x8x16.
// ---------------------------------------------------------------------------
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::NCHW4,
            conv_bias::get_int8_nchw4_args_check_bounds(3));
}
// ---------------------------------------------------------------------------
// CHWN4 tensor-core tests for the three tile configurations. Note the
// ALGO_1/ALGO_2 ordering here is mma32x8x16 then mma8x32x16 (opposite of the
// NCHW4 check-bounds group above), and the output dtype scale is 1.1f.
// ---------------------------------------------------------------------------
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_tensorcore_args(3));
}
// ---------------------------------------------------------------------------
// Bounds-checked CHWN4 tensor-core (mma16x16x16) tests over filter sizes
// 1/5/7.
// ---------------------------------------------------------------------------
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(7));
}
  782. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_WITH_Z) {
  783. require_compute_capability(7, 5);
  784. Checker<ConvBiasForward> checker(handle_cuda());
  785. checker.set_before_exec_callback(
  786. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  787. "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
  788. UniformIntRNG rng{-3, 3};
  789. UniformIntRNG bias_rng{-50, 50};
  790. checker.set_rng(0, &rng)
  791. .set_rng(1, &rng)
  792. .set_rng(2, &bias_rng)
  793. .set_rng(3, &rng)
  794. .set_dtype(0, dtype::QuantizedS8{1.2f})
  795. .set_dtype(1, dtype::QuantizedS8{1.3f})
  796. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  797. .set_dtype(3, dtype::QuantizedS8{1.1f})
  798. .set_dtype(4, dtype::QuantizedS8{1.0f})
  799. .set_epsilon(1 + 1e-3)
  800. .set_max_avg_error(1e-1)
  801. .set_max_avg_biased_error(1e-1);
  802. param::ConvBias param;
  803. param.pad_h = param.pad_w = 1;
  804. param.stride_h = param.stride_w = 1;
  805. param.format = param::ConvBias::Format::NCHW4;
  806. checker.set_param(param).execs({{64, 8, 12, 12, 4},
  807. {64, 8, 3, 3, 4},
  808. {1, 16, 1, 1, 4},
  809. {64, 16, 12, 12, 4},
  810. {}});
  811. }
  812. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_WITH_Z) {
  813. require_compute_capability(7, 5);
  814. Checker<ConvBiasForward> checker(handle_cuda());
  815. checker.set_before_exec_callback(
  816. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  817. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
  818. UniformIntRNG rng{-3, 3};
  819. UniformIntRNG bias_rng{-50, 50};
  820. checker.set_rng(0, &rng)
  821. .set_rng(1, &rng)
  822. .set_rng(2, &bias_rng)
  823. .set_rng(3, &rng)
  824. .set_dtype(0, dtype::QuantizedS8{1.2f})
  825. .set_dtype(1, dtype::QuantizedS8{1.3f})
  826. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  827. .set_dtype(3, dtype::QuantizedS8{1.1f})
  828. .set_dtype(4, dtype::QuantizedS8{1.0f})
  829. .set_epsilon(1 + 1e-3)
  830. .set_max_avg_error(1e-1)
  831. .set_max_avg_biased_error(1e-1);
  832. param::ConvBias param;
  833. param.pad_h = param.pad_w = 1;
  834. param.stride_h = param.stride_w = 1;
  835. param.format = param::ConvBias::Format::CHWN4;
  836. checker.set_param(param).execs({{8, 12, 12, 64, 4},
  837. {8, 3, 3, 64, 4},
  838. {16, 1, 1, 1, 4},
  839. {16, 12, 12, 64, 4},
  840. {}});
  841. }
// ---------------------------------------------------------------------------
// Bounds-checked tests for the "REORDER_FILTER" tensor-core variants, which
// reorder the filter tensor before the implicit GEMM; one test per tile
// configuration (mma16x16x16 / mma8x32x16 / mma32x8x16).
// ---------------------------------------------------------------------------
TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
TEST_F(CUDA,
       CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_check_bounds(3));
}
// ---------------------------------------------------------------------------
// Regular-shape tests for the "REORDER_FILTER" tensor-core variants, one per
// tile configuration.
// ---------------------------------------------------------------------------
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// ---------------------------------------------------------------------------
// "UNROLL_WIDTH" tensor-core variants, one per tile configuration. Note the
// output dtype scale differs: 1.1f for ALGO_0, 1.3f for ALGO_1/ALGO_2.
// ---------------------------------------------------------------------------
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_0) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
}
// ---------------------------------------------------------------------------
// UNROLL_WIDTH mma16x16x16 over filter sizes 1/5/7; the 5x5 and 7x7 cases
// use the small-batch argument generator.
// ---------------------------------------------------------------------------
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_7x7) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(7));
}
// ---------------------------------------------------------------------------
// Small-batch UNROLL_WIDTH tests for the remaining tile configurations
// (ALGO_1 = mma32x8x16, ALGO_2 = mma8x32x16), at filter sizes 5 and 1.
// ---------------------------------------------------------------------------
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(5));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_1) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
    require_compute_capability(7, 5);
    conv_bias::check_conv_bias(
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
            handle_cuda(),
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
            param::ConvBias::Format::CHWN4,
            conv_bias::get_int8_chwn4_args_small_batch(1));
}
  998. #if CUDA_VERSION >= 10020
  999. /// \note: we only check several cases and block sizes in megdnn_test, the full
  1000. /// testcases are written in cutlass repository
  1001. TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
  1002. require_compute_capability_eq(7, 5);
  1003. Checker<ConvBiasForward> checker(handle_cuda());
  1004. auto check = [&checker](const std::string& algo) {
  1005. checker.set_before_exec_callback(
  1006. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
  1007. UniformIntRNG rng{-8, 8};
  1008. UniformIntRNG bias_rng{-50, 50};
  1009. UniformIntRNG const_rng{1, 1};
  1010. // use scale that are all integers to avoid rouding error
  1011. checker.set_rng(0, &rng)
  1012. .set_rng(1, &rng)
  1013. .set_rng(2, &bias_rng)
  1014. .set_rng(3, &rng)
  1015. .set_dtype(0, dtype::QuantizedS8{6.0f})
  1016. .set_dtype(1, dtype::QuantizedS8{1.0f})
  1017. .set_dtype(2, dtype::QuantizedS32{6.0f})
  1018. .set_dtype(3, dtype::QuantizedS8{1.0f})
  1019. .set_dtype(4, dtype::QuantizedS8{6.0f})
  1020. .set_epsilon(1e-3);
  1021. param::ConvBias param;
  1022. param.pad_h = param.pad_w = 1;
  1023. param.stride_h = param.stride_w = 1;
  1024. param.format = param::ConvBias::Format::NCHW32;
  1025. checker.set_param(param).execs({{16, 16, 7, 7, 32},
  1026. {512, 16, 3, 3, 32},
  1027. {1, 16, 1, 1, 32},
  1028. {},
  1029. {}});
  1030. param.nonlineMode = param::ConvBias::NonlineMode::RELU;
  1031. checker.set_param(param).execs({{16, 16, 7, 7, 32},
  1032. {512, 16, 1, 1, 32},
  1033. {1, 16, 1, 1, 32},
  1034. {},
  1035. {}});
  1036. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  1037. checker.set_param(param).execs({{16, 16, 7, 7, 32},
  1038. {512, 16, 3, 3, 32},
  1039. {1, 16, 1, 1, 32},
  1040. {},
  1041. {}});
  1042. // use non integer scale
  1043. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  1044. checker.set_dtype(0, dtype::QuantizedS8{1.1f})
  1045. .set_dtype(1, dtype::QuantizedS8{1.2f})
  1046. .set_dtype(2, dtype::QuantizedS32{1.1f * 1.2f})
  1047. .set_dtype(3, dtype::QuantizedS8{1.1f})
  1048. .set_dtype(4, dtype::QuantizedS8{6.0f})
  1049. .set_epsilon(1 + 1e-3)
  1050. .set_max_avg_error(1e-1)
  1051. .set_max_avg_biased_error(1e-1)
  1052. .execs({{16, 16, 7, 7, 32},
  1053. {512, 16, 3, 3, 32},
  1054. {1, 16, 1, 1, 32},
  1055. {16, 16, 7, 7, 32},
  1056. {}});
  1057. };
  1058. std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
  1059. "INT8_NCHW32_IMMA_IMPLICIT_GEMM_256X128X64_64X64X64",
  1060. ConvBias::DirectParam{});
  1061. check(algo);
  1062. algo = ConvBias::algo_name<ConvBias::DirectParam>(
  1063. "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X64X64_32X16X64",
  1064. ConvBias::DirectParam{});
  1065. check(algo);
  1066. }
  1067. #endif
#if MEGDNN_WITH_BENCHMARK
// ---------------------------------------------------------------------------
// Benchmark-only entries (compiled only with MEGDNN_WITH_BENCHMARK): time the
// int8 conv_bias algorithms on resnet50 / detection layer shapes. Passing
// nullptr as the algorithm name benchmarks all applicable algorithms; the
// *_with_cudnn_tsc variants also compare against cuDNN tensor-core results.
// ---------------------------------------------------------------------------
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4) {
    require_compute_capability(6, 1);
    benchmark_target_algo(
            handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, nullptr,
            param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_DET_ALL_ALGO) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_detection_bench_args(), dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_TENSORCORE) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
            param::ConvBias::Format::NCHW4);
}
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
    require_compute_capability(6, 1);
    // single hand-picked small-channel layer (4 input channels, 224x224)
    std::vector<BenchArgs> args;
    args.push_back(BenchArgs{64, 4, 224, 224, 64, 7, 2});
    benchmark_target_algo(
            handle_cuda(), args, dtype::QuantizedS8{1.2f},
            dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
            dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
            param::ConvBias::Format::CHWN4);
}
#if CUDA_VERSION >= 10020
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
    require_compute_capability(7, 5);
    benchmark_target_algo_with_cudnn_tsc(
            handle_cuda(), get_resnet50_bench_args(256),
            dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
            dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
            "DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM",
            param::ConvBias::Format::NCHW32);
}
#endif
#endif
  1140. } // namespace test
  1141. } // namespace megdnn
  1142. #undef V1
  1143. #undef V
  1144. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台