/**
 * \file dnn/test/cuda/conv_test_utils.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
  12. #include "megdnn/oprs/nn.h"
  13. #include "src/common/utils.h"
  14. #include "src/cuda/cudnn_with_check.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/conv_bias.h"
  17. #include "test/common/tensor.h"
  18. #include "test/common/workspace_wrapper.h"
  19. #include "test/cuda/benchmark.h"
  20. #include "test/cuda/conv_test_utils.h"
  21. #include "test/cuda/fixture.h"
  22. #include "test/cuda/utils.h"
#define V1(x) #x
#define V(x)  V1(x)

namespace megdnn {
namespace test {
namespace conv {

#if MEGDNN_WITH_BENCHMARK
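// Field order of each BenchArgs entry below, inferred from how the fields are
// consumed later in this file (arg.n, arg.ci, arg.hi, arg.wi, arg.co, arg.f,
// arg.s): {batch, input channels, input height, input width, output channels,
// filter size, stride}.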
std::vector<BenchArgs> get_resnet50_bench_args(size_t batch) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
    args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
    args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
    args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
    args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
    args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
    args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
    args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
    args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
    return args;
}
std::vector<BenchArgs> get_detection_bench_args(size_t batch) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
    args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
    args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
    args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
    args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
    return args;
}
std::vector<BenchArgs> get_det_first_bench_args(size_t batch) {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{batch, 4, 736, 1280, 16, 3, 2});
    args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1});
    return args;
}
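// benchmark_target_algo: times every BenchArgs case with the requested MegDNN
// conv-bias algorithm (or with the proxy's algorithm search when `algo` is
// nullptr) and with cuDNN's IMPLICIT_PRECOMP_GEMM conv-bias kernel as the
// reference, then prints runtime, achieved Tops and the speed-up over cuDNN,
// both without and with a z (residual-add) input.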
void benchmark_target_algo(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype, const char* algo,
        param::ConvBias::Format format) {
    megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker_cudnn.set_display(false).set_times(RUNS);

#define CUDNN_VERSION_STRING \
    "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)

    benchmarker_cudnn.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
                    "ALGO_IMPLICIT_PRECOMP_"
                    "GEMM" CUDNN_VERSION_STRING));
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);

    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper function to change format
    auto get_tensor_shape = [](TensorShape shape, Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype::Int8()}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        }
        return ret;
    };
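    // For example, the lambda above turns an NCHW shape {n, c, h, w} (c
    // divisible by 4) into {n, c/4, h, w, 4} for NCHW4 and into
    // {c/4, h, w, n, 4} for CHWN4; other formats fall through unchanged.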
    for (auto&& arg : args) {
        Param param;
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            benchmarker.proxy()->target_execution_policy.algo.reset();
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
                            benchmarker,
                            {get_tensor_shape(src, format),
                             get_tensor_shape(filter, format),
                             get_tensor_shape(bias, format),
                             {},
                             {}},
                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs(
                                 {get_tensor_shape(src, format),
                                  get_tensor_shape(filter, format),
                                  get_tensor_shape(bias, format),
                                  {},
                                  {}}) /
                         RUNS;
        }
        Format format_cudnn = Format::NCHW4;
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        auto time_in_ms_cudnn = benchmarker_cudnn.execs(
                                        {get_tensor_shape(src, format_cudnn),
                                         get_tensor_shape(filter, format_cudnn),
                                         get_tensor_shape(bias, format_cudnn),
                                         {},
                                         {}}) /
                                RUNS;
        // work for this conv in Tera-ops: 2 * N * Co * Ho * Wo * Ci * Fh * Fw
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f / (1e12);
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms, (flo / (time_in_ms * 1e-3)),
               time_in_ms_cudnn, (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
                            benchmarker,
                            {get_tensor_shape(src, format),
                             get_tensor_shape(filter, format),
                             get_tensor_shape(bias, format),
                             get_tensor_shape(z, format),
                             {}},
                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs(
                                 {get_tensor_shape(src, format),
                                  get_tensor_shape(filter, format),
                                  get_tensor_shape(bias, format),
                                  get_tensor_shape(z, format),
                                  {}}) /
                         RUNS;
        }
        time_in_ms_cudnn = benchmarker_cudnn.execs(
                                   {get_tensor_shape(src, format_cudnn),
                                    get_tensor_shape(filter, format_cudnn),
                                    get_tensor_shape(bias, format_cudnn),
                                    get_tensor_shape(z, format_cudnn),
                                    {}}) /
                           RUNS;
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms, (flo / (time_in_ms * 1e-3)),
               time_in_ms_cudnn, (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
    }
}
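// benchmark_target_algo_with_cudnn_tsc: same idea as benchmark_target_algo, but
// geared towards tensor-core layouts (NCHW4/NCHW32/NCHW64/CHWN4/NHWC). Cases
// whose channel counts cannot use the requested layout are skipped, and the
// cuDNN reference run can be disabled (`with_cudnn`) or redirected to another
// algorithm, format and dtypes (`change_cudnn_*`).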
void benchmark_target_algo_with_cudnn_tsc(
        Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
        DType filter_dtype, DType bias_dtype, DType dst_dtype, const char* algo,
        param::ConvBias::Format format, bool with_cudnn, const char* change_cudnn_algo,
        param::ConvBias::Format change_cudnn_format, DType change_cudnn_src_dtype,
        DType change_cudnn_filter_dtype, DType change_cudnn_bias_dtype,
        DType change_cudnn_dst_dtype) {
    megdnn_assert(
            (src_dtype.enumv() == filter_dtype.enumv()) ||
            (src_dtype.enumv() == DTypeEnum::Quantized4Asymm &&
             filter_dtype.enumv() == DTypeEnum::QuantizedS4));
    CUBenchmarker<ConvBiasForward> benchmarker(handle);
    CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
    size_t RUNS = 200;
    benchmarker.set_display(false).set_times(RUNS);
    benchmarker.set_dtype(0, src_dtype)
            .set_dtype(1, filter_dtype)
            .set_dtype(2, bias_dtype)
            .set_dtype(3, dst_dtype)
            .set_dtype(4, dst_dtype);
    benchmarker_cudnn.set_display(false).set_times(RUNS);

    std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
            new OprProxy<ConvBiasForward>{true}};
    if (!algo) {
        benchmarker.set_proxy(proxy);
    }
    if (change_cudnn_algo) {
        benchmarker_cudnn.set_dtype(0, change_cudnn_src_dtype)
                .set_dtype(1, change_cudnn_filter_dtype)
                .set_dtype(2, change_cudnn_bias_dtype)
                .set_dtype(3, change_cudnn_dst_dtype)
                .set_dtype(4, change_cudnn_dst_dtype);
    } else {
        benchmarker_cudnn.set_dtype(0, src_dtype)
                .set_dtype(1, filter_dtype)
                .set_dtype(2, bias_dtype)
                .set_dtype(3, dst_dtype)
                .set_dtype(4, dst_dtype);
        benchmarker_cudnn.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                        "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_"
                        "FWD_"
                        "ALGO_IMPLICIT_PRECOMP_GEMM" CUDNN_VERSION_STRING));
    }
#undef CUDNN_VERSION_STRING
    using Param = ConvBias::Param;
    using Format = Param::Format;
    // helper function to change format
    auto get_tensor_shape = [](TensorShape shape, DType dtype,
                               Format format) -> TensorShape {
        TensorShape ret;
        if (format == Format::NCHW4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::NCHW32) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 32, 32, shape[2], shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::NCHW64) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 64, 64, shape[2], shape[3]})
                            .dimshuffle({0, 1, 3, 4, 2}));
        } else if (format == Format::CHWN4) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}
                            .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]})
                            .dimshuffle({1, 3, 4, 0, 2}));
        } else if (format == Format::NHWC) {
            ret = static_cast<TensorShape>(
                    TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1}));
        }
        return ret;
    };
    for (auto&& arg : args) {
        Param param;
        param.pad_h = param.pad_w = arg.f / 2;
        param.stride_h = param.stride_w = arg.s;
        param.format = format;
        size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
        size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
        benchmarker.set_param(param);
        if (!algo) {
            benchmarker.proxy()->target_execution_policy.algo.reset();
        }
        TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
                filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
                z{arg.n, arg.co, ho, wo}, dst = z;
        // skip testcases which cannot enable nchw32 tensorcore
        if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
            continue;
        // skip testcases which cannot enable nchw64 tensorcore
        if (format == Format::NCHW64 && (arg.co % 64 != 0 || arg.ci % 64 != 0))
            continue;
        // skip testcases which cannot enable nchw4/chwn4 tensorcore
        if ((format == Format::CHWN4 || format == Format::NCHW4) && (arg.ci % 16 != 0))
            continue;
        // skip testcases which cannot enable nhwc tensorcore
        if ((format == Format::NHWC) && (arg.ci % 4 != 0 || arg.co % 4 != 0))
            continue;
        Format format_cudnn =
                arg.ci % 32 == 0 && arg.co % 32 == 0 ? Format::NCHW32 : Format::NCHW4;
        if (change_cudnn_algo) {
            format_cudnn = change_cudnn_format;
        }
        param.format = format_cudnn;
        benchmarker_cudnn.set_param(param);
        float time_in_ms = 0.f;
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
                            benchmarker,
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             {},
                             {}},
                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs(
                                 {get_tensor_shape(src, src_dtype, format),
                                  get_tensor_shape(filter, filter_dtype, format),
                                  get_tensor_shape(bias, bias_dtype, format),
                                  {},
                                  {}}) /
                         RUNS;
        }
        float time_in_ms_cudnn = 0;
        if (with_cudnn) {
            if (change_cudnn_algo) {
                time_in_ms_cudnn =
                        algo_benchmark<
                                ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
                                benchmarker_cudnn,
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype, format_cudnn),
                                 get_tensor_shape(bias, bias_dtype, format_cudnn),
                                 {},
                                 {}},
                                change_cudnn_algo) /
                        RUNS;
            } else {
                time_in_ms_cudnn =
                        benchmarker_cudnn.execs(
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype, format_cudnn),
                                 get_tensor_shape(bias, bias_dtype, format_cudnn),
                                 {},
                                 {}}) /
                        RUNS;
            }
        }
        float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f / (1e12);
        printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
               "time(cudnn)=%.2f %.2fTops, "
               "perf(algo=%s)/perf(cudnn)=%.2f\n",
               src.to_string().c_str(), filter.to_string().c_str(),
               dst.to_string().c_str(), algo, time_in_ms, (flo / (time_in_ms * 1e-3)),
               time_in_ms_cudnn, (flo / (time_in_ms_cudnn * 1e-3)), algo,
               time_in_ms_cudnn / time_in_ms);
        printf("bench with z tensor\n");
        if (algo) {
            time_in_ms =
                    algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
                            benchmarker,
                            {get_tensor_shape(src, src_dtype, format),
                             get_tensor_shape(filter, filter_dtype, format),
                             get_tensor_shape(bias, bias_dtype, format),
                             get_tensor_shape(z, src_dtype, format),
                             {}},
                            algo) /
                    RUNS;
        } else {
            time_in_ms = benchmarker.execs(
                                 {get_tensor_shape(src, src_dtype, format),
                                  get_tensor_shape(filter, filter_dtype, format),
                                  get_tensor_shape(bias, bias_dtype, format),
                                  get_tensor_shape(z, src_dtype, format),
                                  {}}) /
                         RUNS;
        }
        time_in_ms_cudnn = 0;
        if (with_cudnn) {
            if (change_cudnn_algo) {
                time_in_ms_cudnn =
                        algo_benchmark<
                                ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
                                benchmarker_cudnn,
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype, format_cudnn),
                                 get_tensor_shape(bias, bias_dtype, format_cudnn),
                                 get_tensor_shape(z, src_dtype, format_cudnn),
                                 {}},
                                change_cudnn_algo) /
                        RUNS;
            } else {
                time_in_ms_cudnn =
                        benchmarker_cudnn.execs(
                                {get_tensor_shape(src, src_dtype, format_cudnn),
                                 get_tensor_shape(filter, filter_dtype, format_cudnn),
                                 get_tensor_shape(bias, bias_dtype, format_cudnn),
                                 get_tensor_shape(z, src_dtype, format_cudnn),
                                 {}}) /
                        RUNS;
            }
        }
  441. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  442. "time(cudnn)=%.2f %.2fTops, "
  443. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  444. src.to_string().c_str(), filter.to_string().c_str(),
  445. dst.to_string().c_str(), algo, time_in_ms, (flo / (time_in_ms * 1e-3)),
  446. time_in_ms_cudnn, (flo / (time_in_ms_cudnn * 1e-3)), algo,
  447. time_in_ms_cudnn / time_in_ms);
  448. }
  449. }
  450. #endif
  451. } // namespace conv
  452. } // namespace test
  453. } // namespace megdnn
  454. #undef V1
  455. #undef V