You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

conv_test_utils.cpp 24 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510
  1. /**
  2. * \file dnn/test/cuda/conv_test_utils.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/oprs/nn.h"
  13. #include "src/common/utils.h"
  14. #include "src/cuda/cudnn_with_check.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/conv_bias.h"
  17. #include "test/common/tensor.h"
  18. #include "test/common/workspace_wrapper.h"
  19. #include "test/cuda/benchmark.h"
  20. #include "test/cuda/conv_test_utils.h"
  21. #include "test/cuda/fixture.h"
  22. #include "test/cuda/utils.h"
  23. #define V1(x) #x
  24. #define V(x) V1(x)
  25. namespace megdnn {
  26. namespace test {
  27. namespace conv {
  28. #if MEGDNN_WITH_BENCHMARK
  29. std::vector<BenchArgs> get_resnet50_bench_args(size_t batch) {
  30. std::vector<BenchArgs> args;
  31. args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
  32. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
  33. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
  34. args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
  35. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
  36. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
  37. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
  38. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
  39. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
  40. args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
  41. args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
  42. args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
  43. args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
  44. args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
  45. args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
  46. args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
  47. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
  48. args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
  49. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
  50. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2});
  51. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
  52. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
  53. args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
  54. args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
  55. args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
  56. return args;
  57. }
  58. std::vector<BenchArgs> get_detection_bench_args(size_t batch) {
  59. std::vector<BenchArgs> args;
  60. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
  61. args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
  62. args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
  63. args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
  64. args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
  65. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
  66. args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
  67. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
  68. args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
  69. args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
  70. args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
  71. args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
  72. args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
  73. args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
  74. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
  75. args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
  76. args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
  77. args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
  78. args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
  79. args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
  80. args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
  81. args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
  82. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
  83. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
  84. args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
  85. return args;
  86. }
  87. std::vector<BenchArgs> get_det_first_bench_args(size_t batch) {
  88. std::vector<BenchArgs> args;
  89. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 16, 3, 2});
  90. args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
  91. args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2});
  92. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1});
  93. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1});
  94. return args;
  95. }
//! Benchmark one conv_bias algorithm against cuDNN's
//! IMPLICIT_PRECOMP_GEMM kernel for every layer shape in \p args.
//! Each shape is timed twice -- without and with a z-side input tensor --
//! and the average time, achieved Tops, and the perf ratio of both
//! implementations are printed to stdout.
//!
//! \param handle        megdnn handle the benchmarkers run on.
//! \param args          layer shapes to measure (batch/ci/hi/wi/co/f/s).
//! \param algo          name of the target algorithm; if nullptr the opr
//!                      proxy's heuristic picks one per shape.
//! \param format        tensor format used for the target algorithm; the
//!                      cuDNN reference always runs in NCHW4.
void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args,
                           DType src_dtype, DType filter_dtype,
                           DType bias_dtype, DType dst_dtype, const char* algo,
                           param::ConvBias::Format format) {
    // src and filter must share the same dtype category in this helper.
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
// Expands to e.g. "v8.1.0"; appended to the cuDNN algorithm name below.
// Deliberately NOT #undef'd at the end of this function:
// benchmark_target_algo_with_cudnn_tsc below reuses it and does the #undef.
#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
    // Pin the reference benchmarker to the cuDNN IMPLICIT_PRECOMP_GEMM
    // kernel so the comparison target is fixed.
    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
                    "ALGO_IMPLICIT_PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING));
    // Tensor order for conv_bias: 0=src, 1=filter, 2=bias, 3=z, 4=dst.
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper function to change format: rewrites an NCHW TensorShape into
    // NCHW4 or CHWN4 via reshape + dimshuffle (assumes the channel dim is
    // divisible by 4); any other format yields an empty shape.
    auto get_tensor_shape = [](TensorShape shape,
                               Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        }
        return ret;
    };
    for (auto&& arg : args) {
        Param param;
        // SAME-style padding: f/2 preserves the spatial size for stride 1.
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            // Let the heuristic re-select an algorithm for this shape.
            benchmarker.proxy()->target_execution_policy.algo.reset();
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        // --- measurement 1: without a z tensor ---
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             {},
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            {},
                                            {}}) /
                         RUNS;
        }
        Format format_cudnn = Format::NCHW4;
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        auto time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         {},
                                         {}}) /
                RUNS;
        // Arithmetic work of the convolution (2*MACs), in Tera-ops.
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
        // NOTE(review): algo may be nullptr on the heuristic path; feeding
        // it to %s relies on the libc tolerating NULL -- consider printing
        // a fallback literal instead.
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        // --- measurement 2: same shapes, with a z-side input tensor ---
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(benchmarker,
                                            {get_tensor_shape(src, format),
                                             get_tensor_shape(filter, format),
                                             get_tensor_shape(bias, format),
                                             get_tensor_shape(z, format),
                                             {}},
                                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
                                            get_tensor_shape(filter, format),
                                            get_tensor_shape(bias, format),
                                            get_tensor_shape(z, format),
                                            {}}) /
                         RUNS;
        }
        time_in_ms_cudnn =
                benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         get_tensor_shape(z, format_cudnn),
                                         {}}) /
                RUNS;
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}
//! Like benchmark_target_algo, but geared toward tensor-core algorithms:
//! shapes whose channel counts cannot enable tensor cores in \p format are
//! skipped, the cuDNN reference can be disabled entirely (\p with_cudnn ==
//! false), or replaced by an arbitrary algorithm / format / dtype
//! combination via the change_cudnn_* parameters.
//!
//! \param algo              target algorithm name; nullptr means "use the
//!                          proxy heuristic per shape".
//! \param with_cudnn        if false, the reference side is not measured
//!                          (its time is reported as 0).
//! \param change_cudnn_algo if non-null, benchmark this named algorithm
//!                          (with change_cudnn_format / *_dtype) instead of
//!                          the pinned cuDNN IMPLICIT_PRECOMP_GEMM kernel.
void benchmark_target_algo_with_cudnn_tsc(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype, const char* algo,
        param::ConvBias::Format format, bool with_cudnn,
        const char* change_cudnn_algo,
        param::ConvBias::Format change_cudnn_format,
        DType change_cudnn_src_dtype, DType change_cudnn_filter_dtype,
        DType change_cudnn_bias_dtype, DType change_cudnn_dst_dtype) {
    // dtypes must match, except the uint4-src x int4-filter quantized combo.
    megdnn_assert((src_dtype.enumv() == filter_dtype.enumv()) ||
                  (src_dtype.enumv() == DTypeEnum::Quantized4Asymm &&
                   filter_dtype.enumv() == DTypeEnum::QuantizedS4));
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 200;
    benchmarker.set_display(false).set_times(RUNS);
    // Tensor order for conv_bias: 0=src, 1=filter, 2=bias, 3=z, 4=dst.
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_display(false).set_times(RUNS);
    std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
            new OprProxy<ConvBiasForward>{true}};
    if (!algo) {
        // No fixed algorithm: hand the benchmarker a profiling proxy.
        benchmarker.set_proxy(proxy);
    }
    if (change_cudnn_algo) {
        // Reference benchmarker runs with the caller-supplied dtypes ...
        benchmarker_cudnn.set_dtype(0, change_cudnn_src_dtype)
                .set_dtype(1, change_cudnn_filter_dtype)
                .set_dtype(2, change_cudnn_bias_dtype)
                .set_dtype(3, change_cudnn_dst_dtype)
                .set_dtype(4, change_cudnn_dst_dtype);
    } else {
        // ... otherwise it mirrors the target dtypes and is pinned to the
        // cuDNN IMPLICIT_PRECOMP_GEMM kernel.
        benchmarker_cudnn.set_dtype(0, src_dtype)
                .set_dtype(1, filter_dtype)
                .set_dtype(2, bias_dtype)
                .set_dtype(3, dst_dtype)
                .set_dtype(4, dst_dtype);
        benchmarker_cudnn.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                        "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_"
                        "FWD_"
                        "ALGO_IMPLICIT_PRECOMP_GEMM" CUDNN_VERSION_STRING));
    }
// CUDNN_VERSION_STRING was #define'd in benchmark_target_algo above; this
// is its last use in the file.
#undef CUDNN_VERSION_STRING
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper function to change format: rewrites an NCHW TensorShape into
    // the requested packed layout via reshape + dimshuffle (channel count
    // must be divisible by the pack size); unknown formats yield an empty
    // shape.
    auto get_tensor_shape = [](TensorShape shape, DType dtype,
                               Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::NCHW32) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 32, 32, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::NCHW64) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 64, 64, shape[2],
                                      shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2],
                                      shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        } else if (format == Format::NHWC) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1}));
        }
        return ret;
    };
    for (auto&& arg : args) {
        Param param;
        // SAME-style padding: f/2 preserves the spatial size for stride 1.
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            // Let the heuristic re-select an algorithm for this shape.
            benchmarker.proxy()->target_execution_policy.algo.reset();
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        // skip testcase which cannot enable nchw32 tensorcore
        if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
            continue;
        // skip testcase which cannot enable nchw64 tensorcore
        if (format == Format::NCHW64 && (arg.co % 64 != 0 || arg.ci % 64 != 0))
            continue;
        // skip testcase which cannot enable nchw4/chwn4 tensorcore
        if ((format == Format::CHWN4 || format == Format::NCHW4) &&
            (arg.ci % 16 != 0))
            continue;
        // skip testcase which cannot enable nhwc tensorcore
        if ((format == Format::NHWC) && (arg.ci % 4 != 0 || arg.co % 4 != 0))
            continue;
        // Prefer NCHW32 for the cuDNN side whenever the channels allow it.
        Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
                                      ? Format::NCHW32
                                      : Format::NCHW4;
        if (change_cudnn_algo) {
            format_cudnn = change_cudnn_format;
        }
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        // --- measurement 1: without a z tensor ---
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(
                            benchmarker,
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             {},
                             {}},
                            algo) /
                    RUNS;
        } else {
            time_in_ms =
                    benchmarker.execs(
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             {},
                             {}}) /
                    RUNS;
        }
        // Reference time stays 0 when the cuDNN side is disabled.
        float time_in_ms_cudnn = 0;
        if (with_cudnn) {
            if (change_cudnn_algo) {
                time_in_ms_cudnn =
                        algo_benchmark<ConvBiasForward,
                                       OprProxy<ConvBiasForward>, CUTimer>(
                                benchmarker_cudnn,
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(bias, bias_dtype,
                                                  format_cudnn),
                                 {},
                                 {}},
                                change_cudnn_algo) /
                        RUNS;
            } else {
                time_in_ms_cudnn =
                        benchmarker_cudnn.execs(
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(bias, bias_dtype,
                                                  format_cudnn),
                                 {},
                                 {}}) /
                        RUNS;
            }
        }
        // Arithmetic work of the convolution (2*MACs), in Tera-ops.
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
                    (1e12);
        // NOTE(review): algo may be nullptr on the heuristic path; feeding
        // it to %s relies on the libc tolerating NULL. Also, when
        // with_cudnn is false the cudnn Tops below divides by zero.
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        // --- measurement 2: same shapes, with a z-side input tensor ---
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
                                   CUTimer>(
                            benchmarker,
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             get_tensor_shape(z, src_dtype, format),
                             {}},
                            algo) /
                    RUNS;
        } else {
            time_in_ms =
                    benchmarker.execs(
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             get_tensor_shape(z, src_dtype, format),
                             {}}) /
                    RUNS;
        }
        time_in_ms_cudnn = 0;
        if (with_cudnn) {
            if (change_cudnn_algo) {
                time_in_ms_cudnn =
                        algo_benchmark<ConvBiasForward,
                                       OprProxy<ConvBiasForward>, CUTimer>(
                                benchmarker_cudnn,
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(bias, bias_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(z, src_dtype, format_cudnn),
                                 {}},
                                change_cudnn_algo) /
                        RUNS;
            } else {
                time_in_ms_cudnn =
                        benchmarker_cudnn.execs(
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(bias, bias_dtype,
                                                  format_cudnn),
                                 get_tensor_shape(z, src_dtype, format_cudnn),
                                 {}}) /
                        RUNS;
            }
        }
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms,
               (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
               (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}
  475. #endif
  476. } // namespace conv
  477. } // namespace test
  478. } // namespace megdnn
  479. #undef V1
  480. #undef V

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台