You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

conv_test_utils.cpp 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
  1. /**
  2. * \file dnn/test/cuda/conv_test_utils.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/oprs/nn.h"
  13. #include "src/common/utils.h"
  14. #include "src/cuda/cudnn_with_check.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/conv_bias.h"
  17. #include "test/common/tensor.h"
  18. #include "test/common/workspace_wrapper.h"
  19. #include "test/cuda/benchmark.h"
  20. #include "test/cuda/conv_test_utils.h"
  21. #include "test/cuda/fixture.h"
  22. #include "test/cuda/utils.h"
  23. #define V1(x) #x
  24. #define V(x) V1(x)
  25. namespace megdnn {
  26. namespace test {
  27. namespace conv {
  28. #if MEGDNN_WITH_BENCHMARK
  29. std::vector<BenchArgs> get_resnet50_bench_args(size_t batch) {
  30. std::vector<BenchArgs> args;
  31. args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
  32. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
  33. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
  34. args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
  35. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
  36. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
  37. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
  38. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
  39. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
  40. args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
  41. args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
  42. args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
  43. args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
  44. args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
  45. args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
  46. args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
  47. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
  48. args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
  49. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
  50. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2});
  51. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
  52. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
  53. args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
  54. args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
  55. args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
  56. return args;
  57. }
  58. std::vector<BenchArgs> get_detection_bench_args(size_t batch) {
  59. std::vector<BenchArgs> args;
  60. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
  61. args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
  62. args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
  63. args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
  64. args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
  65. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
  66. args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
  67. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
  68. args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
  69. args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
  70. args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
  71. args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
  72. args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
  73. args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
  74. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
  75. args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
  76. args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
  77. args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
  78. args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
  79. args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
  80. args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
  81. args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
  82. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
  83. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
  84. args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
  85. return args;
  86. }
  87. std::vector<BenchArgs> get_det_first_bench_args(size_t batch) {
  88. std::vector<BenchArgs> args;
  89. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 16, 3, 2});
  90. args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
  91. args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2});
  92. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1});
  93. args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 3, 2});
  94. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1});
  95. args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 1, 2});
  96. return args;
  97. }
  98. void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args,
  99. DType src_dtype, DType filter_dtype,
  100. DType bias_dtype, DType dst_dtype, const char* algo,
  101. param::ConvBias::Format format) {
  102. megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
  103. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  104. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  105. size_t RUNS = 1000;
  106. benchmarker.set_display(false).set_times(RUNS);
  107. benchmarker_cudnn.set_display(false).set_times(RUNS);
  108. #define CUDNN_VERSION_STRING \
  109. "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  110. benchmarker_cudnn.set_before_exec_callback(
  111. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  112. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
  113. "ALGO_IMPLICIT_PRECOMP_"
  114. "GEMM" CUDNN_VERSION_STRING));
  115. benchmarker.set_dtype(0, src_dtype)
  116. .set_dtype(1, filter_dtype)
  117. .set_dtype(2, bias_dtype)
  118. .set_dtype(3, dst_dtype)
  119. .set_dtype(4, dst_dtype);
  120. benchmarker_cudnn.set_dtype(0, src_dtype)
  121. .set_dtype(1, filter_dtype)
  122. .set_dtype(2, bias_dtype)
  123. .set_dtype(3, dst_dtype)
  124. .set_dtype(4, dst_dtype);
  125. using Param = ConvBias::Param;
  126. using Format = Param::Format;
  127. // helper function to change format
  128. auto get_tensor_shape = [](TensorShape shape,
  129. Format format) -> TensorShape {
  130. TensorShape ret;
  131. if (format == Format::NCHW4) {
  132. ret = static_cast<TensorShape>(
  133. TensorLayout{shape, dtype::Int8()}
  134. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  135. shape[3]})
  136. .dimshuffle({0, 1, 3, 4, 2}));
  137. } else if (format == Format::CHWN4) {
  138. ret = static_cast<TensorShape>(
  139. TensorLayout{shape, dtype::Int8()}
  140. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  141. shape[3]})
  142. .dimshuffle({1, 3, 4, 0, 2}));
  143. }
  144. return ret;
  145. };
  146. for (auto&& arg : args) {
  147. Param param;
  148. param.pad_h = param.pad_w = arg.f / 2;
  149. param.stride_h = param.stride_w = arg.s;
  150. param.format = format;
  151. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  152. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  153. benchmarker.set_param(param);
  154. if (!algo) {
  155. benchmarker.proxy()->target_execution_policy.algo.reset();
  156. }
  157. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  158. filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
  159. z{arg.n, arg.co, ho, wo}, dst = z;
  160. float time_in_ms = 0.f;
  161. if (algo) {
  162. time_in_ms =
  163. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  164. CUTimer>(benchmarker,
  165. {get_tensor_shape(src, format),
  166. get_tensor_shape(filter, format),
  167. get_tensor_shape(bias, format),
  168. {},
  169. {}},
  170. algo) /
  171. RUNS;
  172. } else {
  173. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  174. get_tensor_shape(filter, format),
  175. get_tensor_shape(bias, format),
  176. {},
  177. {}}) /
  178. RUNS;
  179. }
  180. Format format_cudnn = Format::NCHW4;
  181. param.format = format_cudnn;
  182. benchmarker_cudnn.set_param(param);
  183. auto time_in_ms_cudnn =
  184. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  185. get_tensor_shape(filter, format_cudnn),
  186. get_tensor_shape(bias, format_cudnn),
  187. {},
  188. {}}) /
  189. RUNS;
  190. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
  191. (1e12);
  192. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  193. "time(cudnn)=%.2f %.2fTops, "
  194. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  195. src.to_string().c_str(), filter.to_string().c_str(),
  196. dst.to_string().c_str(), algo, time_in_ms,
  197. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  198. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  199. time_in_ms_cudnn / time_in_ms);
  200. printf("bench with z tensor\n");
  201. if (algo) {
  202. time_in_ms =
  203. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  204. CUTimer>(benchmarker,
  205. {get_tensor_shape(src, format),
  206. get_tensor_shape(filter, format),
  207. get_tensor_shape(bias, format),
  208. get_tensor_shape(z, format),
  209. {}},
  210. algo) /
  211. RUNS;
  212. } else {
  213. time_in_ms = benchmarker.execs({get_tensor_shape(src, format),
  214. get_tensor_shape(filter, format),
  215. get_tensor_shape(bias, format),
  216. get_tensor_shape(z, format),
  217. {}}) /
  218. RUNS;
  219. }
  220. time_in_ms_cudnn =
  221. benchmarker_cudnn.execs({get_tensor_shape(src, format_cudnn),
  222. get_tensor_shape(filter, format_cudnn),
  223. get_tensor_shape(bias, format_cudnn),
  224. get_tensor_shape(z, format_cudnn),
  225. {}}) /
  226. RUNS;
  227. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  228. "time(cudnn)=%.2f %.2fTops, "
  229. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  230. src.to_string().c_str(), filter.to_string().c_str(),
  231. dst.to_string().c_str(), algo, time_in_ms,
  232. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  233. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  234. time_in_ms_cudnn / time_in_ms);
  235. }
  236. }
  237. void benchmark_target_algo_with_cudnn_tsc(
  238. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  239. DType filter_dtype, DType bias_dtype, DType dst_dtype, const char* algo,
  240. param::ConvBias::Format format, bool with_cudnn,
  241. const char* change_cudnn_algo,
  242. param::ConvBias::Format change_cudnn_format,
  243. DType change_cudnn_src_dtype, DType change_cudnn_filter_dtype,
  244. DType change_cudnn_bias_dtype, DType change_cudnn_dst_dtype) {
  245. megdnn_assert((src_dtype.enumv() == filter_dtype.enumv()) ||
  246. (src_dtype.enumv() == DTypeEnum::Quantized4Asymm &&
  247. filter_dtype.enumv() == DTypeEnum::QuantizedS4));
  248. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  249. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  250. size_t RUNS = 200;
  251. benchmarker.set_display(false).set_times(RUNS);
  252. benchmarker.set_dtype(0, src_dtype)
  253. .set_dtype(1, filter_dtype)
  254. .set_dtype(2, bias_dtype)
  255. .set_dtype(3, dst_dtype)
  256. .set_dtype(4, dst_dtype);
  257. benchmarker_cudnn.set_display(false).set_times(RUNS);
  258. std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
  259. new OprProxy<ConvBiasForward>{true}};
  260. if (!algo) {
  261. benchmarker.set_proxy(proxy);
  262. }
  263. if (change_cudnn_algo) {
  264. benchmarker_cudnn.set_dtype(0, change_cudnn_src_dtype)
  265. .set_dtype(1, change_cudnn_filter_dtype)
  266. .set_dtype(2, change_cudnn_bias_dtype)
  267. .set_dtype(3, change_cudnn_dst_dtype)
  268. .set_dtype(4, change_cudnn_dst_dtype);
  269. } else {
  270. benchmarker_cudnn.set_dtype(0, src_dtype)
  271. .set_dtype(1, filter_dtype)
  272. .set_dtype(2, bias_dtype)
  273. .set_dtype(3, dst_dtype)
  274. .set_dtype(4, dst_dtype);
  275. benchmarker_cudnn.set_before_exec_callback(
  276. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  277. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_"
  278. "FWD_"
  279. "ALGO_IMPLICIT_PRECOMP_GEMM" CUDNN_VERSION_STRING));
  280. }
  281. #undef CUDNN_VERSION_STRING
  282. using Param = ConvBias::Param;
  283. using Format = Param::Format;
  284. // helper function to change format
  285. auto get_tensor_shape = [](TensorShape shape, DType dtype,
  286. Format format) -> TensorShape {
  287. TensorShape ret;
  288. if (format == Format::NCHW4) {
  289. ret = static_cast<TensorShape>(
  290. TensorLayout{shape, dtype}
  291. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  292. shape[3]})
  293. .dimshuffle({0, 1, 3, 4, 2}));
  294. } else if (format == Format::NCHW32) {
  295. ret = static_cast<TensorShape>(
  296. TensorLayout{shape, dtype}
  297. .reshape({shape[0], shape[1] / 32, 32, shape[2],
  298. shape[3]})
  299. .dimshuffle({0, 1, 3, 4, 2}));
  300. } else if (format == Format::NCHW64) {
  301. ret = static_cast<TensorShape>(
  302. TensorLayout{shape, dtype}
  303. .reshape({shape[0], shape[1] / 64, 64, shape[2],
  304. shape[3]})
  305. .dimshuffle({0, 1, 3, 4, 2}));
  306. } else if (format == Format::CHWN4) {
  307. ret = static_cast<TensorShape>(
  308. TensorLayout{shape, dtype}
  309. .reshape({shape[0], shape[1] / 4, 4, shape[2],
  310. shape[3]})
  311. .dimshuffle({1, 3, 4, 0, 2}));
  312. }
  313. return ret;
  314. };
  315. for (auto&& arg : args) {
  316. Param param;
  317. param.pad_h = param.pad_w = arg.f / 2;
  318. param.stride_h = param.stride_w = arg.s;
  319. param.format = format;
  320. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  321. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  322. benchmarker.set_param(param);
  323. if (!algo) {
  324. benchmarker.proxy()->target_execution_policy.algo.reset();
  325. }
  326. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  327. filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
  328. z{arg.n, arg.co, ho, wo}, dst = z;
  329. // skip testcase which cannot enable nchw32 tensorcore
  330. if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
  331. continue;
  332. // skip testcase which cannot enable nchw32 tensorcore
  333. if (format == Format::NCHW64 && (arg.co % 64 != 0 || arg.ci % 64 != 0))
  334. continue;
  335. // skip testcase which cannot enable nchw4/chwn4 tensorcore
  336. if ((format == Format::CHWN4 || format == Format::NCHW4) &&
  337. (arg.ci % 16 != 0))
  338. continue;
  339. Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
  340. ? Format::NCHW32
  341. : Format::NCHW4;
  342. if (change_cudnn_algo) {
  343. format_cudnn = change_cudnn_format;
  344. }
  345. param.format = format_cudnn;
  346. benchmarker_cudnn.set_param(param);
  347. float time_in_ms = 0.f;
  348. if (algo) {
  349. time_in_ms =
  350. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  351. CUTimer>(
  352. benchmarker,
  353. {get_tensor_shape(src, src_dtype, format),
  354. get_tensor_shape(filter, filter_dtype, format),
  355. get_tensor_shape(bias, bias_dtype, format),
  356. {},
  357. {}},
  358. algo) /
  359. RUNS;
  360. } else {
  361. time_in_ms =
  362. benchmarker.execs(
  363. {get_tensor_shape(src, src_dtype, format),
  364. get_tensor_shape(filter, filter_dtype, format),
  365. get_tensor_shape(bias, bias_dtype, format),
  366. {},
  367. {}}) /
  368. RUNS;
  369. }
  370. float time_in_ms_cudnn = 0;
  371. if (with_cudnn) {
  372. if (change_cudnn_algo) {
  373. time_in_ms_cudnn =
  374. algo_benchmark<ConvBiasForward,
  375. OprProxy<ConvBiasForward>, CUTimer>(
  376. benchmarker_cudnn,
  377. {get_tensor_shape(src, src_dtype, format_cudnn),
  378. get_tensor_shape(filter, filter_dtype,
  379. format_cudnn),
  380. get_tensor_shape(bias, bias_dtype,
  381. format_cudnn),
  382. {},
  383. {}},
  384. change_cudnn_algo) /
  385. RUNS;
  386. } else {
  387. time_in_ms_cudnn =
  388. benchmarker_cudnn.execs(
  389. {get_tensor_shape(src, src_dtype, format_cudnn),
  390. get_tensor_shape(filter, filter_dtype,
  391. format_cudnn),
  392. get_tensor_shape(bias, bias_dtype,
  393. format_cudnn),
  394. {},
  395. {}}) /
  396. RUNS;
  397. }
  398. }
  399. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f /
  400. (1e12);
  401. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  402. "time(cudnn)=%.2f %.2fTops, "
  403. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  404. src.to_string().c_str(), filter.to_string().c_str(),
  405. dst.to_string().c_str(), algo, time_in_ms,
  406. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  407. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  408. time_in_ms_cudnn / time_in_ms);
  409. printf("bench with z tensor\n");
  410. if (algo) {
  411. time_in_ms =
  412. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>,
  413. CUTimer>(
  414. benchmarker,
  415. {get_tensor_shape(src, src_dtype, format),
  416. get_tensor_shape(filter, filter_dtype, format),
  417. get_tensor_shape(bias, bias_dtype, format),
  418. get_tensor_shape(z, src_dtype, format),
  419. {}},
  420. algo) /
  421. RUNS;
  422. } else {
  423. time_in_ms =
  424. benchmarker.execs(
  425. {get_tensor_shape(src, src_dtype, format),
  426. get_tensor_shape(filter, filter_dtype, format),
  427. get_tensor_shape(bias, bias_dtype, format),
  428. get_tensor_shape(z, src_dtype, format),
  429. {}}) /
  430. RUNS;
  431. }
  432. time_in_ms_cudnn = 0;
  433. if (with_cudnn) {
  434. if (change_cudnn_algo) {
  435. time_in_ms_cudnn =
  436. algo_benchmark<ConvBiasForward,
  437. OprProxy<ConvBiasForward>, CUTimer>(
  438. benchmarker_cudnn,
  439. {get_tensor_shape(src, src_dtype, format_cudnn),
  440. get_tensor_shape(filter, filter_dtype,
  441. format_cudnn),
  442. get_tensor_shape(bias, bias_dtype,
  443. format_cudnn),
  444. get_tensor_shape(z, src_dtype, format_cudnn),
  445. {}},
  446. change_cudnn_algo) /
  447. RUNS;
  448. } else {
  449. time_in_ms_cudnn =
  450. benchmarker_cudnn.execs(
  451. {get_tensor_shape(src, src_dtype, format_cudnn),
  452. get_tensor_shape(filter, filter_dtype,
  453. format_cudnn),
  454. get_tensor_shape(bias, bias_dtype,
  455. format_cudnn),
  456. get_tensor_shape(z, src_dtype, format_cudnn),
  457. {}}) /
  458. RUNS;
  459. }
  460. }
  461. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  462. "time(cudnn)=%.2f %.2fTops, "
  463. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  464. src.to_string().c_str(), filter.to_string().c_str(),
  465. dst.to_string().c_str(), algo, time_in_ms,
  466. (flo / (time_in_ms * 1e-3)), time_in_ms_cudnn,
  467. (flo / (time_in_ms_cudnn * 1e-3)), algo,
  468. time_in_ms_cudnn / time_in_ms);
  469. }
  470. }
  471. #endif
  472. } // namespace conv
  473. } // namespace test
  474. } // namespace megdnn
  475. #undef V1
  476. #undef V

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台