You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

conv_bias_int8.cpp 55 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244
  1. /**
  2. * \file dnn/test/cuda/conv_bias_int8.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "megdnn/oprs/nn.h"
  12. #include "src/common/utils.h"
  13. #include "src/cuda/cudnn_with_check.h"
  14. #include "test/common/checker.h"
  15. #include "test/common/conv_bias.h"
  16. #include "test/cuda/benchmark.h"
  17. #include "test/cuda/fixture.h"
  18. #include "test/cuda/utils.h"
  19. #define V1(x) #x
  20. #define V(x) V1(x)
  21. namespace megdnn {
  22. namespace test {
  23. namespace {
  24. #if MEGDNN_WITH_BENCHMARK
  25. struct BenchArgs {
  26. size_t n, ci, hi, wi, co, f, s;
  27. };
  28. std::vector<BenchArgs> get_resnet50_bench_args(size_t batch = 64) {
  29. std::vector<BenchArgs> args;
  30. args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
  31. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
  32. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
  33. args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
  34. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
  35. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
  36. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
  37. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
  38. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
  39. args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
  40. args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
  41. args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
  42. args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
  43. args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
  44. args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
  45. args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
  46. args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
  47. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
  48. args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
  49. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
  50. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
  51. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
  52. args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
  53. args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
  54. args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
  55. return args;
  56. }
  57. std::vector<BenchArgs> get_detection_bench_args(size_t batch = 16) {
  58. std::vector<BenchArgs> args;
  59. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
  60. args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
  61. args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
  62. args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
  63. args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
  64. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
  65. args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
  66. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
  67. args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
  68. args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
  69. args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
  70. args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
  71. args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
  72. args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
  73. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
  74. args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
  75. args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
  76. args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
  77. args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
  78. args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
  79. args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
  80. args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
  81. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
  82. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
  83. args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
  84. return args;
  85. }
  86. void benchmark_target_algo(
  87. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  88. DType filter_dtype, DType bias_dtype, DType dst_dtype,
  89. const char* algo = nullptr,
  90. param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
  91. megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
  92. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  93. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  94. size_t RUNS = 1000;
  95. benchmarker.set_display(false).set_times(RUNS);
  96. benchmarker_cudnn.set_display(false).set_times(RUNS);
  97. if (algo) {
  98. benchmarker.set_before_exec_callback(
  99. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo));
  100. }
  101. #define CUDNN_VERSION_STRING \
  102. "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  103. benchmarker_cudnn.set_before_exec_callback(
  104. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  105. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
  106. "ALGO_IMPLICIT_PRECOMP_"
  107. "GEMM" CUDNN_VERSION_STRING));
  108. benchmarker.set_dtype(0, src_dtype)
  109. .set_dtype(1, filter_dtype)
  110. .set_dtype(2, bias_dtype)
  111. .set_dtype(3, dst_dtype)
  112. .set_dtype(4, dst_dtype);
  113. benchmarker_cudnn.set_dtype(0, src_dtype)
  114. .set_dtype(1, filter_dtype)
  115. .set_dtype(2, bias_dtype)
  116. .set_dtype(3, dst_dtype)
  117. .set_dtype(4, dst_dtype);
  118. using Param = ConvBias::Param;
  119. using Format = Param::Format;
  120. if (format == Format::NCHW4) {
  121. for (auto&& arg : args) {
  122. Param param;
  123. param.pad_h = param.pad_w = arg.f / 2;
  124. param.stride_h = param.stride_w = arg.s;
  125. param.format = Format::NCHW4;
  126. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  127. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  128. benchmarker.set_param(param);
  129. auto time_in_ms =
  130. benchmarker.execs({{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
  131. {arg.co, arg.ci / 4, arg.f, arg.f, 4},
  132. {1, arg.co / 4, 1, 1, 4},
  133. {},
  134. {}}) /
  135. RUNS;
  136. param.nonlineMode = Param::NonlineMode::IDENTITY;
  137. benchmarker_cudnn.set_param(param);
  138. auto time_in_ms_cudnn =
  139. benchmarker_cudnn.execs(
  140. {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
  141. {arg.co, arg.ci / 4, arg.f, arg.f, 4},
  142. {1, arg.co / 4, 1, 1, 4},
  143. {},
  144. {}}) /
  145. RUNS;
  146. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f *
  147. arg.f / (1e12);
  148. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  149. filter{arg.co, arg.ci, arg.f, arg.f};
  150. printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
  151. "time(cudnn)=%.2f %.2fTops, "
  152. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  153. src.to_string().c_str(), filter.to_string().c_str(), algo,
  154. time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  155. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  156. time_in_ms_cudnn / time_in_ms);
  157. }
  158. printf("bench with z tensor\n");
  159. for (auto&& arg : args) {
  160. Param param;
  161. param.pad_h = param.pad_w = arg.f / 2;
  162. param.stride_h = param.stride_w = arg.s;
  163. param.format = Format::NCHW4;
  164. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  165. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  166. benchmarker.set_param(param);
  167. auto time_in_ms =
  168. benchmarker.execs({{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
  169. {arg.co, arg.ci / 4, arg.f, arg.f, 4},
  170. {1, arg.co / 4, 1, 1, 4},
  171. {arg.n, arg.co / 4, ho, wo, 4},
  172. {}}) /
  173. RUNS;
  174. param.format = Format::NCHW4;
  175. param.nonlineMode = Param::NonlineMode::IDENTITY;
  176. benchmarker_cudnn.set_param(param);
  177. auto time_in_ms_cudnn =
  178. benchmarker_cudnn.execs(
  179. {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
  180. {arg.co, arg.ci / 4, arg.f, arg.f, 4},
  181. {1, arg.co / 4, 1, 1, 4},
  182. {arg.n, arg.co / 4, ho, wo, 4},
  183. {}}) /
  184. RUNS;
  185. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f *
  186. arg.f / (1e12);
  187. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  188. filter{arg.co, arg.ci, arg.f, arg.f};
  189. printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
  190. "time(cudnn)=%.2f %.2fTops, "
  191. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  192. src.to_string().c_str(), filter.to_string().c_str(), algo,
  193. time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  194. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  195. time_in_ms_cudnn / time_in_ms);
  196. }
  197. } else if (format == Format::CHWN4) {
  198. for (auto&& arg : args) {
  199. Param param;
  200. param.pad_h = param.pad_w = arg.f / 2;
  201. param.stride_h = param.stride_w = arg.s;
  202. param.format = Format::CHWN4;
  203. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  204. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  205. benchmarker.set_param(param);
  206. auto time_in_ms =
  207. benchmarker.execs({{arg.ci / 4, arg.hi, arg.wi, arg.n, 4},
  208. {arg.ci / 4, arg.f, arg.f, arg.co, 4},
  209. {arg.co / 4, 1, 1, 1, 4},
  210. {},
  211. {}}) /
  212. RUNS;
  213. param.format = Format::NCHW4;
  214. benchmarker_cudnn.set_param(param);
  215. auto time_in_ms_cudnn =
  216. benchmarker_cudnn.execs(
  217. {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
  218. {arg.co, arg.ci / 4, arg.f, arg.f, 4},
  219. {1, arg.co / 4, 1, 1, 4},
  220. {},
  221. {}}) /
  222. RUNS;
  223. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f *
  224. arg.f / (1e12);
  225. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  226. filter{arg.co, arg.ci, arg.f, arg.f};
  227. printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
  228. "time(cudnn)=%.2f %.2fTops, "
  229. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  230. src.to_string().c_str(), filter.to_string().c_str(), algo,
  231. time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  232. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  233. time_in_ms_cudnn / time_in_ms);
  234. }
  235. printf("bench with z tensor\n");
  236. for (auto&& arg : args) {
  237. Param param;
  238. param.pad_h = param.pad_w = arg.f / 2;
  239. param.stride_h = param.stride_w = arg.s;
  240. param.format = Format::CHWN4;
  241. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  242. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  243. benchmarker.set_param(param);
  244. auto time_in_ms =
  245. benchmarker.execs({{arg.ci / 4, arg.hi, arg.wi, arg.n, 4},
  246. {arg.ci / 4, arg.f, arg.f, arg.co, 4},
  247. {arg.co / 4, 1, 1, 1, 4},
  248. {arg.co / 4, ho, wo, arg.n, 4},
  249. {}}) /
  250. RUNS;
  251. param.format = Format::NCHW4;
  252. benchmarker_cudnn.set_param(param);
  253. param.nonlineMode = Param::NonlineMode::IDENTITY;
  254. auto time_in_ms_cudnn =
  255. benchmarker_cudnn.execs(
  256. {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
  257. {arg.co, arg.ci / 4, arg.f, arg.f, 4},
  258. {1, arg.co / 4, 1, 1, 4},
  259. {arg.n, arg.co / 4, ho, wo, 4},
  260. {}}) /
  261. RUNS;
  262. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f *
  263. arg.f / (1e12);
  264. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  265. filter{arg.co, arg.ci, arg.f, arg.f};
  266. printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
  267. "time(cudnn)=%.2f %.2fTops, "
  268. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  269. src.to_string().c_str(), filter.to_string().c_str(), algo,
  270. time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  271. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  272. time_in_ms_cudnn / time_in_ms);
  273. }
  274. }
  275. }
  276. void benchmark_target_algo_with_cudnn_tsc(
  277. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  278. DType filter_dtype, DType bias_dtype, DType dst_dtype,
  279. const char* algo = nullptr,
  280. param::ConvBias::Format format = param::ConvBias::Format::NCHW4) {
  281. megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
  282. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  283. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  284. size_t RUNS = 1000;
  285. benchmarker.set_display(false).set_times(RUNS);
  286. benchmarker_cudnn.set_display(false).set_times(RUNS);
  287. std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
  288. new OprProxy<ConvBiasForward>{true}};
  289. if (algo) {
  290. benchmarker.set_before_exec_callback(
  291. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo));
  292. } else {
  293. benchmarker.set_proxy(proxy);
  294. }
  295. benchmarker_cudnn.set_before_exec_callback(
  296. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  297. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
  298. "ALGO_IMPLICIT_PRECOMP_"
  299. "GEMM" CUDNN_VERSION_STRING));
  300. #undef CUDNN_VERSION_STRING
  301. benchmarker.set_dtype(0, src_dtype)
  302. .set_dtype(1, filter_dtype)
  303. .set_dtype(2, bias_dtype)
  304. .set_dtype(3, dst_dtype)
  305. .set_dtype(4, dst_dtype);
  306. benchmarker_cudnn.set_dtype(0, src_dtype)
  307. .set_dtype(1, filter_dtype)
  308. .set_dtype(2, bias_dtype)
  309. .set_dtype(3, dst_dtype)
  310. .set_dtype(4, dst_dtype);
  311. using Param = ConvBias::Param;
  312. using Format = Param::Format;
  313. if (format == Format::NCHW4) {
  314. for (auto&& arg : args) {
  315. Param param;
  316. param.pad_h = param.pad_w = arg.f / 2;
  317. param.stride_h = param.stride_w = arg.s;
  318. param.format = Format::NCHW4;
  319. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  320. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  321. benchmarker.set_param(param);
  322. if (!algo) {
  323. benchmarker.proxy()->target_algo = nullptr;
  324. }
  325. auto time_in_ms =
  326. benchmarker.execs({{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
  327. {arg.co, arg.ci / 4, arg.f, arg.f, 4},
  328. {1, arg.co / 4, 1, 1, 4},
  329. {},
  330. {}}) /
  331. RUNS;
  332. param.format = Format::NCHW32;
  333. benchmarker_cudnn.set_param(param);
  334. auto time_in_ms_cudnn =
  335. benchmarker_cudnn.execs(
  336. {{arg.n, arg.ci / 32, arg.hi, arg.wi, 32},
  337. {arg.co, arg.ci / 32, arg.f, arg.f, 32},
  338. {1, arg.co / 32, 1, 1, 32},
  339. {},
  340. {}}) /
  341. RUNS;
  342. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f *
  343. arg.f / (1e12);
  344. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  345. filter{arg.co, arg.ci, arg.f, arg.f};
  346. printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
  347. "time(cudnn)=%.2f %.2fTops, "
  348. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  349. src.to_string().c_str(), filter.to_string().c_str(), algo,
  350. time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  351. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  352. time_in_ms_cudnn / time_in_ms);
  353. }
  354. } else if (format == Format::CHWN4) {
  355. for (auto&& arg : args) {
  356. Param param;
  357. param.pad_h = param.pad_w = arg.f / 2;
  358. param.stride_h = param.stride_w = arg.s;
  359. param.format = Format::CHWN4;
  360. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  361. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  362. benchmarker.set_param(param);
  363. if (!algo) {
  364. benchmarker.proxy()->target_algo = nullptr;
  365. }
  366. auto time_in_ms =
  367. benchmarker.execs({{arg.ci / 4, arg.hi, arg.wi, arg.n, 4},
  368. {arg.ci / 4, arg.f, arg.f, arg.co, 4},
  369. {arg.co / 4, 1, 1, 1, 4},
  370. {},
  371. {}}) /
  372. RUNS;
  373. float time_in_ms_cudnn = 0.f;
  374. if (arg.ci % 32 == 0 && arg.co % 32 == 0) {
  375. param.format = Format::NCHW32;
  376. benchmarker_cudnn.set_param(param);
  377. time_in_ms_cudnn =
  378. benchmarker_cudnn.execs(
  379. {{arg.n, arg.ci / 32, arg.hi, arg.wi, 32},
  380. {arg.co, arg.ci / 32, arg.f, arg.f, 32},
  381. {1, arg.co / 32, 1, 1, 32},
  382. {},
  383. {}}) /
  384. RUNS;
  385. } else {
  386. param.format = Format::NCHW4;
  387. benchmarker_cudnn.set_param(param);
  388. time_in_ms_cudnn =
  389. benchmarker_cudnn.execs(
  390. {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
  391. {arg.co, arg.ci / 4, arg.f, arg.f, 4},
  392. {1, arg.co / 4, 1, 1, 4},
  393. {},
  394. {}}) /
  395. RUNS;
  396. }
  397. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f *
  398. arg.f / (1e12);
  399. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  400. filter{arg.co, arg.ci, arg.f, arg.f};
  401. printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
  402. "time(cudnn)=%.2f %.2fTops, "
  403. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  404. src.to_string().c_str(), filter.to_string().c_str(), algo,
  405. time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  406. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  407. time_in_ms_cudnn / time_in_ms);
  408. }
  409. printf("bench with z tensor\n");
  410. for (auto&& arg : args) {
  411. Param param;
  412. param.pad_h = param.pad_w = arg.f / 2;
  413. param.stride_h = param.stride_w = arg.s;
  414. param.format = Format::CHWN4;
  415. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  416. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  417. benchmarker.set_param(param);
  418. if (!algo) {
  419. benchmarker.proxy()->target_algo = nullptr;
  420. }
  421. auto time_in_ms =
  422. benchmarker.execs({{arg.ci / 4, arg.hi, arg.wi, arg.n, 4},
  423. {arg.ci / 4, arg.f, arg.f, arg.co, 4},
  424. {arg.co / 4, 1, 1, 1, 4},
  425. {arg.co / 4, ho, wo, arg.n, 4},
  426. {}}) /
  427. RUNS;
  428. float time_in_ms_cudnn = 0.f;
  429. if (arg.ci % 32 == 0 && arg.co % 32 == 0) {
  430. param.format = Format::NCHW32;
  431. benchmarker_cudnn.set_param(param);
  432. time_in_ms_cudnn =
  433. benchmarker_cudnn.execs(
  434. {{arg.n, arg.ci / 32, arg.hi, arg.wi, 32},
  435. {arg.co, arg.ci / 32, arg.f, arg.f, 32},
  436. {1, arg.co / 32, 1, 1, 32},
  437. {arg.n, arg.co / 32, ho, wo, 32},
  438. {}}) /
  439. RUNS;
  440. } else {
  441. param.format = Format::NCHW4;
  442. benchmarker_cudnn.set_param(param);
  443. time_in_ms_cudnn =
  444. benchmarker_cudnn.execs(
  445. {{arg.n, arg.ci / 4, arg.hi, arg.wi, 4},
  446. {arg.co, arg.ci / 4, arg.f, arg.f, 4},
  447. {1, arg.co / 4, 1, 1, 4},
  448. {arg.n, arg.co / 4, ho, wo, 4},
  449. {}}) /
  450. RUNS;
  451. }
  452. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f *
  453. arg.f / (1e12);
  454. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  455. filter{arg.co, arg.ci, arg.f, arg.f};
  456. printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
  457. "time(cudnn)=%.2f %.2fTops, "
  458. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  459. src.to_string().c_str(), filter.to_string().c_str(), algo,
  460. time_in_ms, (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  461. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  462. time_in_ms_cudnn / time_in_ms);
  463. }
  464. }
  465. }
  466. #endif
  467. } // namespace
  468. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_1x1) {
  469. require_compute_capability(6, 1);
  470. conv_bias::check_conv_bias(
  471. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  472. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  473. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  474. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(1));
  475. }
  476. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_3x3) {
  477. require_compute_capability(6, 1);
  478. conv_bias::check_conv_bias(
  479. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  480. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  481. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  482. param::ConvBias::Format::NCHW4);
  483. }
  484. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_5x5) {
  485. require_compute_capability(6, 1);
  486. conv_bias::check_conv_bias(
  487. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  488. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  489. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  490. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(5));
  491. }
  492. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_7x7) {
  493. require_compute_capability(6, 1);
  494. conv_bias::check_conv_bias(
  495. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  496. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  497. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  498. param::ConvBias::Format::NCHW4, conv_bias::get_int8_nchw4_args(7));
  499. }
  500. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_WITH_Z) {
  501. require_compute_capability(6, 1);
  502. Checker<ConvBiasForward> checker(handle_cuda());
  503. checker.set_before_exec_callback(
  504. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  505. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  506. UniformIntRNG rng{-3, 3};
  507. UniformIntRNG bias_rng{-50, 50};
  508. checker.set_rng(0, &rng)
  509. .set_rng(1, &rng)
  510. .set_rng(2, &bias_rng)
  511. .set_rng(3, &rng)
  512. .set_dtype(0, dtype::QuantizedS8{1.2f})
  513. .set_dtype(1, dtype::QuantizedS8{1.3f})
  514. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  515. .set_dtype(3, dtype::QuantizedS8{1.1f})
  516. .set_dtype(4, dtype::QuantizedS8{1.0f})
  517. .set_epsilon(1 + 1e-3)
  518. .set_max_avg_error(1e-1)
  519. .set_max_avg_biased_error(1e-1);
  520. param::ConvBias param;
  521. param.pad_h = param.pad_w = 1;
  522. param.stride_h = param.stride_w = 1;
  523. param.format = param::ConvBias::Format::NCHW4;
  524. checker.set_param(param).execs({{32, 4, 12, 12, 4},
  525. {16, 4, 3, 3, 4},
  526. {1, 4, 1, 1, 4},
  527. {32, 4, 12, 12, 4},
  528. {}});
  529. }
  530. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_STRIDE2_WITH_Z) {
  531. require_compute_capability(6, 1);
  532. Checker<ConvBiasForward> checker(handle_cuda());
  533. checker.set_before_exec_callback(
  534. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  535. "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
  536. UniformIntRNG rng{-3, 3};
  537. UniformIntRNG bias_rng{-50, 50};
  538. checker.set_rng(0, &rng)
  539. .set_rng(1, &rng)
  540. .set_rng(2, &bias_rng)
  541. .set_rng(3, &rng)
  542. .set_dtype(0, dtype::QuantizedS8{1.2f})
  543. .set_dtype(1, dtype::QuantizedS8{1.3f})
  544. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  545. .set_dtype(3, dtype::QuantizedS8{1.1f})
  546. .set_dtype(4, dtype::QuantizedS8{1.0f})
  547. .set_epsilon(1 + 1e-3)
  548. .set_max_avg_error(1e-1)
  549. .set_max_avg_biased_error(1e-1);
  550. param::ConvBias param;
  551. param.pad_h = param.pad_w = 1;
  552. param.stride_h = param.stride_w = 2;
  553. param.format = param::ConvBias::Format::NCHW4;
  554. checker.set_param(param).execs({{32, 4, 12, 12, 4},
  555. {16, 4, 3, 3, 4},
  556. {1, 4, 1, 1, 4},
  557. {32, 4, 6, 6, 4},
  558. {}});
  559. }
  560. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_1x1) {
  561. require_compute_capability(6, 1);
  562. conv_bias::check_conv_bias(
  563. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  564. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  565. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  566. param::ConvBias::Format::NCHW4,
  567. conv_bias::get_int8_nchw4_args_check_bounds(1));
  568. }
  569. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_3x3) {
  570. require_compute_capability(6, 1);
  571. conv_bias::check_conv_bias(
  572. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  573. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  574. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  575. param::ConvBias::Format::NCHW4,
  576. conv_bias::get_int8_nchw4_args_check_bounds(3));
  577. }
  578. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_5x5) {
  579. require_compute_capability(6, 1);
  580. conv_bias::check_conv_bias(
  581. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  582. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  583. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  584. param::ConvBias::Format::NCHW4,
  585. conv_bias::get_int8_nchw4_args_check_bounds(5));
  586. }
  587. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_CHECK_BOUNDS_7x7) {
  588. require_compute_capability(6, 1);
  589. conv_bias::check_conv_bias(
  590. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  591. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  592. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  593. param::ConvBias::Format::NCHW4,
  594. conv_bias::get_int8_nchw4_args_check_bounds(7));
  595. }
  596. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4) {
  597. require_compute_capability(6, 1);
  598. conv_bias::check_conv_bias(
  599. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  600. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  601. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  602. param::ConvBias::Format::CHWN4);
  603. }
  604. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_WITH_Z) {
  605. require_compute_capability(6, 1);
  606. Checker<ConvBiasForward> checker(handle_cuda());
  607. checker.set_before_exec_callback(
  608. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  609. "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
  610. UniformIntRNG rng{-3, 3};
  611. UniformIntRNG bias_rng{-50, 50};
  612. checker.set_rng(0, &rng)
  613. .set_rng(1, &rng)
  614. .set_rng(2, &bias_rng)
  615. .set_rng(3, &rng)
  616. .set_dtype(0, dtype::QuantizedS8{1.2f})
  617. .set_dtype(1, dtype::QuantizedS8{1.3f})
  618. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  619. .set_dtype(3, dtype::QuantizedS8{1.1f})
  620. .set_dtype(4, dtype::QuantizedS8{1.1f})
  621. .set_epsilon(1 + 1e-3)
  622. .set_max_avg_error(1e-1)
  623. .set_max_avg_biased_error(1e-1);
  624. param::ConvBias param;
  625. param.pad_h = param.pad_w = 1;
  626. param.stride_h = param.stride_w = 1;
  627. param.format = param::ConvBias::Format::CHWN4;
  628. checker.set_param(param).execs({{4, 12, 12, 32, 4},
  629. {4, 3, 3, 16, 4},
  630. {4, 1, 1, 1, 4},
  631. {4, 12, 12, 32, 4},
  632. {}});
  633. }
  634. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_HSWISH) {
  635. require_compute_capability(6, 1);
  636. Checker<ConvBiasForward> checker(handle_cuda());
  637. checker.set_before_exec_callback(
  638. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  639. "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM"));
  640. UniformIntRNG rng{-3, 3};
  641. UniformIntRNG bias_rng{-50, 50};
  642. checker.set_rng(0, &rng)
  643. .set_rng(1, &rng)
  644. .set_rng(2, &bias_rng)
  645. .set_rng(3, &rng)
  646. .set_dtype(0, dtype::QuantizedS8{1.2f})
  647. .set_dtype(1, dtype::QuantizedS8{1.3f})
  648. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  649. .set_dtype(4, dtype::QuantizedS8{0.001f})
  650. .set_epsilon(1 + 1e-3)
  651. .set_max_avg_error(1e-1)
  652. .set_max_avg_biased_error(1e-1);
  653. param::ConvBias param;
  654. param.pad_h = param.pad_w = 1;
  655. param.stride_h = param.stride_w = 1;
  656. param.format = param::ConvBias::Format::CHWN4;
  657. param.nonlineMode = param::ConvBias::NonlineMode::H_SWISH;
  658. checker.set_param(param).execs({{4, 12, 12, 32, 4},
  659. {4, 3, 3, 16, 4},
  660. {4, 1, 1, 1, 4},
  661. {},
  662. {}});
  663. }
  664. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_CHECK_BOUNDS) {
  665. require_compute_capability(6, 1);
  666. conv_bias::check_conv_bias(
  667. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  668. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  669. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  670. param::ConvBias::Format::CHWN4,
  671. conv_bias::get_int8_chwn4_args_check_bounds(3));
  672. }
  673. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1) {
  674. require_compute_capability(6, 1);
  675. conv_bias::check_conv_bias(
  676. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  677. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  678. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  679. param::ConvBias::Format::CHWN4,
  680. conv_bias::get_int8_chwn4_small_channel_args(1));
  681. }
  682. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_3x3) {
  683. require_compute_capability(6, 1);
  684. conv_bias::check_conv_bias(
  685. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  686. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  687. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  688. param::ConvBias::Format::CHWN4,
  689. conv_bias::get_int8_chwn4_small_channel_args(3));
  690. }
  691. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5) {
  692. require_compute_capability(6, 1);
  693. conv_bias::check_conv_bias(
  694. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  695. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  696. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  697. param::ConvBias::Format::CHWN4,
  698. conv_bias::get_int8_chwn4_small_channel_args(5));
  699. }
  700. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7) {
  701. require_compute_capability(6, 1);
  702. conv_bias::check_conv_bias(
  703. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  704. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  705. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  706. param::ConvBias::Format::CHWN4,
  707. conv_bias::get_int8_chwn4_small_channel_args(7));
  708. }
  709. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_SMALL_CHANNEL_CHECK_BOUNDS) {
  710. require_compute_capability(6, 1);
  711. conv_bias::check_conv_bias(
  712. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  713. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  714. handle_cuda(), "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  715. param::ConvBias::Format::NCHW4,
  716. conv_bias::get_int8_nchw4_small_channel_args_check_bounds(3));
  717. }
  718. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_1x1_CHECK_BOUNDS) {
  719. require_compute_capability(6, 1);
  720. conv_bias::check_conv_bias(
  721. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  722. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  723. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  724. param::ConvBias::Format::CHWN4,
  725. conv_bias::get_int8_chwn4_small_channel_args_check_bounds(1));
  726. }
  727. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_5x5_CHECK_BOUNDS) {
  728. require_compute_capability(6, 1);
  729. conv_bias::check_conv_bias(
  730. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  731. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  732. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  733. param::ConvBias::Format::CHWN4,
  734. conv_bias::get_int8_chwn4_small_channel_args_check_bounds(5));
  735. }
  736. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL_7x7_CHECK_BOUNDS) {
  737. require_compute_capability(6, 1);
  738. conv_bias::check_conv_bias(
  739. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  740. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  741. handle_cuda(), "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  742. param::ConvBias::Format::CHWN4,
  743. conv_bias::get_int8_chwn4_small_channel_args_check_bounds(7));
  744. }
  745. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_1x1) {
  746. require_compute_capability(7, 5);
  747. conv_bias::check_conv_bias(
  748. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  749. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  750. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  751. param::ConvBias::Format::NCHW4,
  752. conv_bias::get_int8_nchw4_tensorcore_args(1));
  753. }
  754. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_3x3) {
  755. require_compute_capability(7, 5);
  756. conv_bias::check_conv_bias(
  757. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  758. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  759. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  760. param::ConvBias::Format::NCHW4,
  761. conv_bias::get_int8_nchw4_tensorcore_args(3));
  762. }
  763. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_5x5) {
  764. require_compute_capability(7, 5);
  765. conv_bias::check_conv_bias(
  766. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  767. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  768. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  769. param::ConvBias::Format::NCHW4,
  770. conv_bias::get_int8_nchw4_tensorcore_args(5));
  771. }
  772. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_7x7) {
  773. require_compute_capability(7, 5);
  774. conv_bias::check_conv_bias(
  775. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  776. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  777. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  778. param::ConvBias::Format::NCHW4,
  779. conv_bias::get_int8_nchw4_tensorcore_args(7));
  780. }
  781. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
  782. require_compute_capability(7, 5);
  783. conv_bias::check_conv_bias(
  784. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  785. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  786. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  787. param::ConvBias::Format::NCHW4,
  788. conv_bias::get_int8_nchw4_args_check_bounds(3));
  789. }
  790. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
  791. require_compute_capability(7, 5);
  792. conv_bias::check_conv_bias(
  793. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  794. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  795. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma8x32x16",
  796. param::ConvBias::Format::NCHW4,
  797. conv_bias::get_int8_nchw4_args_check_bounds(3));
  798. }
  799. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
  800. require_compute_capability(7, 5);
  801. conv_bias::check_conv_bias(
  802. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  803. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  804. handle_cuda(), "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma32x8x16",
  805. param::ConvBias::Format::NCHW4,
  806. conv_bias::get_int8_nchw4_args_check_bounds(3));
  807. }
  808. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_0) {
  809. require_compute_capability(7, 5);
  810. conv_bias::check_conv_bias(
  811. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  812. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  813. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  814. param::ConvBias::Format::CHWN4,
  815. conv_bias::get_int8_chwn4_tensorcore_args(3));
  816. }
  817. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_1) {
  818. require_compute_capability(7, 5);
  819. conv_bias::check_conv_bias(
  820. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  821. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  822. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma32x8x16",
  823. param::ConvBias::Format::CHWN4,
  824. conv_bias::get_int8_chwn4_tensorcore_args(3));
  825. }
  826. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_ALGO_2) {
  827. require_compute_capability(7, 5);
  828. conv_bias::check_conv_bias(
  829. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  830. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  831. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma8x32x16",
  832. param::ConvBias::Format::CHWN4,
  833. conv_bias::get_int8_chwn4_tensorcore_args(3));
  834. }
  835. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_1x1) {
  836. require_compute_capability(7, 5);
  837. conv_bias::check_conv_bias(
  838. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  839. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  840. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  841. param::ConvBias::Format::CHWN4,
  842. conv_bias::get_int8_chwn4_args_check_bounds(1));
  843. }
  844. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_5x5) {
  845. require_compute_capability(7, 5);
  846. conv_bias::check_conv_bias(
  847. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  848. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  849. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  850. param::ConvBias::Format::CHWN4,
  851. conv_bias::get_int8_chwn4_args_check_bounds(5));
  852. }
  853. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_CHECK_BOUNDS_7x7) {
  854. require_compute_capability(7, 5);
  855. conv_bias::check_conv_bias(
  856. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  857. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  858. handle_cuda(), "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  859. param::ConvBias::Format::CHWN4,
  860. conv_bias::get_int8_chwn4_args_check_bounds(7));
  861. }
  862. TEST_F(CUDA, CONV_BIAS_INT8_NCHW4_TENSORCORE_WITH_Z) {
  863. require_compute_capability(7, 5);
  864. Checker<ConvBiasForward> checker(handle_cuda());
  865. checker.set_before_exec_callback(
  866. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  867. "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
  868. UniformIntRNG rng{-3, 3};
  869. UniformIntRNG bias_rng{-50, 50};
  870. checker.set_rng(0, &rng)
  871. .set_rng(1, &rng)
  872. .set_rng(2, &bias_rng)
  873. .set_rng(3, &rng)
  874. .set_dtype(0, dtype::QuantizedS8{1.2f})
  875. .set_dtype(1, dtype::QuantizedS8{1.3f})
  876. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  877. .set_dtype(3, dtype::QuantizedS8{1.1f})
  878. .set_dtype(4, dtype::QuantizedS8{1.0f})
  879. .set_epsilon(1 + 1e-3)
  880. .set_max_avg_error(1e-1)
  881. .set_max_avg_biased_error(1e-1);
  882. param::ConvBias param;
  883. param.pad_h = param.pad_w = 1;
  884. param.stride_h = param.stride_w = 1;
  885. param.format = param::ConvBias::Format::NCHW4;
  886. checker.set_param(param).execs({{64, 8, 12, 12, 4},
  887. {64, 8, 3, 3, 4},
  888. {1, 16, 1, 1, 4},
  889. {64, 16, 12, 12, 4},
  890. {}});
  891. }
  892. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_TENSORCORE_WITH_Z) {
  893. require_compute_capability(7, 5);
  894. Checker<ConvBiasForward> checker(handle_cuda());
  895. checker.set_before_exec_callback(
  896. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  897. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16"));
  898. UniformIntRNG rng{-3, 3};
  899. UniformIntRNG bias_rng{-50, 50};
  900. checker.set_rng(0, &rng)
  901. .set_rng(1, &rng)
  902. .set_rng(2, &bias_rng)
  903. .set_rng(3, &rng)
  904. .set_dtype(0, dtype::QuantizedS8{1.2f})
  905. .set_dtype(1, dtype::QuantizedS8{1.3f})
  906. .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
  907. .set_dtype(3, dtype::QuantizedS8{1.1f})
  908. .set_dtype(4, dtype::QuantizedS8{1.0f})
  909. .set_epsilon(1 + 1e-3)
  910. .set_max_avg_error(1e-1)
  911. .set_max_avg_biased_error(1e-1);
  912. param::ConvBias param;
  913. param.pad_h = param.pad_w = 1;
  914. param.stride_h = param.stride_w = 1;
  915. param.format = param::ConvBias::Format::CHWN4;
  916. checker.set_param(param).execs({{8, 12, 12, 64, 4},
  917. {8, 3, 3, 64, 4},
  918. {16, 1, 1, 1, 4},
  919. {16, 12, 12, 64, 4},
  920. {}});
  921. }
  922. TEST_F(CUDA,
  923. CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_0) {
  924. require_compute_capability(7, 5);
  925. conv_bias::check_conv_bias(
  926. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  927. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  928. handle_cuda(),
  929. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
  930. param::ConvBias::Format::CHWN4,
  931. conv_bias::get_int8_chwn4_args_check_bounds(3));
  932. }
  933. TEST_F(CUDA,
  934. CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_1) {
  935. require_compute_capability(7, 5);
  936. conv_bias::check_conv_bias(
  937. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  938. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  939. handle_cuda(),
  940. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
  941. param::ConvBias::Format::CHWN4,
  942. conv_bias::get_int8_chwn4_args_check_bounds(3));
  943. }
  944. TEST_F(CUDA,
  945. CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_CHECK_BOUNDS_ALGO_2) {
  946. require_compute_capability(7, 5);
  947. conv_bias::check_conv_bias(
  948. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  949. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  950. handle_cuda(),
  951. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
  952. param::ConvBias::Format::CHWN4,
  953. conv_bias::get_int8_chwn4_args_check_bounds(3));
  954. }
  955. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_0) {
  956. require_compute_capability(7, 5);
  957. conv_bias::check_conv_bias(
  958. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  959. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  960. handle_cuda(),
  961. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma16x16x16",
  962. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  963. }
  964. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_1) {
  965. require_compute_capability(7, 5);
  966. conv_bias::check_conv_bias(
  967. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  968. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  969. handle_cuda(),
  970. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma8x32x16",
  971. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  972. }
  973. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_REFORMAT_FILTER_TENSORCORE_ALGO_2) {
  974. require_compute_capability(7, 5);
  975. conv_bias::check_conv_bias(
  976. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  977. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  978. handle_cuda(),
  979. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_mma32x8x16",
  980. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  981. }
  982. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_0) {
  983. require_compute_capability(7, 5);
  984. conv_bias::check_conv_bias(
  985. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  986. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  987. handle_cuda(),
  988. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
  989. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  990. }
  991. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_1) {
  992. require_compute_capability(7, 5);
  993. conv_bias::check_conv_bias(
  994. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  995. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  996. handle_cuda(),
  997. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
  998. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  999. }
  1000. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_ALGO_2) {
  1001. require_compute_capability(7, 5);
  1002. conv_bias::check_conv_bias(
  1003. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1004. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.3f},
  1005. handle_cuda(),
  1006. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
  1007. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(3));
  1008. }
  1009. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1) {
  1010. require_compute_capability(7, 5);
  1011. conv_bias::check_conv_bias(
  1012. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1013. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  1014. handle_cuda(),
  1015. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
  1016. param::ConvBias::Format::CHWN4, conv_bias::get_int8_chwn4_args(1));
  1017. }
  1018. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5) {
  1019. require_compute_capability(7, 5);
  1020. conv_bias::check_conv_bias(
  1021. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1022. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  1023. handle_cuda(),
  1024. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
  1025. param::ConvBias::Format::CHWN4,
  1026. conv_bias::get_int8_chwn4_args_small_batch(5));
  1027. }
  1028. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_7x7) {
  1029. require_compute_capability(7, 5);
  1030. conv_bias::check_conv_bias(
  1031. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1032. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  1033. handle_cuda(),
  1034. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma16x16x16",
  1035. param::ConvBias::Format::CHWN4,
  1036. conv_bias::get_int8_chwn4_args_small_batch(7));
  1037. }
  1038. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_1) {
  1039. require_compute_capability(7, 5);
  1040. conv_bias::check_conv_bias(
  1041. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1042. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  1043. handle_cuda(),
  1044. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
  1045. param::ConvBias::Format::CHWN4,
  1046. conv_bias::get_int8_chwn4_args_small_batch(5));
  1047. }
  1048. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_5x5_ALGO_2) {
  1049. require_compute_capability(7, 5);
  1050. conv_bias::check_conv_bias(
  1051. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1052. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  1053. handle_cuda(),
  1054. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
  1055. param::ConvBias::Format::CHWN4,
  1056. conv_bias::get_int8_chwn4_args_small_batch(5));
  1057. }
  1058. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_1) {
  1059. require_compute_capability(7, 5);
  1060. conv_bias::check_conv_bias(
  1061. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1062. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  1063. handle_cuda(),
  1064. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma32x8x16",
  1065. param::ConvBias::Format::CHWN4,
  1066. conv_bias::get_int8_chwn4_args_small_batch(1));
  1067. }
  1068. TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
  1069. require_compute_capability(7, 5);
  1070. conv_bias::check_conv_bias(
  1071. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1072. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.1f},
  1073. handle_cuda(),
  1074. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_mma8x32x16",
  1075. param::ConvBias::Format::CHWN4,
  1076. conv_bias::get_int8_chwn4_args_small_batch(1));
  1077. }
  1078. #if MEGDNN_WITH_BENCHMARK
  1079. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
  1080. require_compute_capability(6, 1);
  1081. benchmark_target_algo(
  1082. handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
  1083. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  1084. dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  1085. param::ConvBias::Format::CHWN4);
  1086. }
  1087. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4) {
  1088. require_compute_capability(6, 1);
  1089. benchmark_target_algo(
  1090. handle_cuda(), get_resnet50_bench_args(), dtype::QuantizedS8{1.2f},
  1091. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  1092. dtype::QuantizedS8{1.0f}, "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM",
  1093. param::ConvBias::Format::NCHW4);
  1094. }
  1095. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE) {
  1096. require_compute_capability(7, 5);
  1097. benchmark_target_algo_with_cudnn_tsc(
  1098. handle_cuda(), get_resnet50_bench_args(256),
  1099. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1100. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
  1101. "INT8_CHWN4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  1102. param::ConvBias::Format::CHWN4);
  1103. }
  1104. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_TENSORCORE_ALL_ALGO) {
  1105. require_compute_capability(7, 5);
  1106. benchmark_target_algo_with_cudnn_tsc(
  1107. handle_cuda(), get_resnet50_bench_args(256),
  1108. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1109. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f}, nullptr,
  1110. param::ConvBias::Format::CHWN4);
  1111. }
  1112. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_DET_ALL_ALGO) {
  1113. require_compute_capability(7, 5);
  1114. benchmark_target_algo_with_cudnn_tsc(
  1115. handle_cuda(), get_detection_bench_args(), dtype::QuantizedS8{1.2f},
  1116. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  1117. dtype::QuantizedS8{1.0f}, nullptr, param::ConvBias::Format::CHWN4);
  1118. }
  1119. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_TENSORCORE) {
  1120. require_compute_capability(7, 5);
  1121. benchmark_target_algo_with_cudnn_tsc(
  1122. handle_cuda(), get_resnet50_bench_args(256),
  1123. dtype::QuantizedS8{1.2f}, dtype::QuantizedS8{1.3f},
  1124. dtype::QuantizedS32{1.2f * 1.3f}, dtype::QuantizedS8{1.0f},
  1125. "INT8_NCHW4_IMMA_IMPLICIT_GEMM_mma16x16x16",
  1126. param::ConvBias::Format::NCHW4);
  1127. }
  1128. TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
  1129. require_compute_capability(6, 1);
  1130. std::vector<BenchArgs> args;
  1131. args.push_back(BenchArgs{64, 4, 224, 224, 64, 7, 2});
  1132. benchmark_target_algo(
  1133. handle_cuda(), args, dtype::QuantizedS8{1.2f},
  1134. dtype::QuantizedS8{1.3f}, dtype::QuantizedS32{1.2f * 1.3f},
  1135. dtype::QuantizedS8{1.0f}, "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM",
  1136. param::ConvBias::Format::CHWN4);
  1137. }
  1138. #endif
  1139. } // namespace test
  1140. } // namespace megdnn
  1141. #undef V1
  1142. #undef V
  1143. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台