You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

conv_test_utils.cpp 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. #include "megdnn/oprs/nn.h"
  2. #include "src/common/utils.h"
  3. #include "src/cuda/cudnn_with_check.h"
  4. #include "test/common/checker.h"
  5. #include "test/common/conv_bias.h"
  6. #include "test/common/tensor.h"
  7. #include "test/common/workspace_wrapper.h"
  8. #include "test/cuda/benchmark.h"
  9. #include "test/cuda/conv_test_utils.h"
  10. #include "test/cuda/fixture.h"
  11. #include "test/cuda/utils.h"
  12. #define V1(x) #x
  13. #define V(x) V1(x)
  14. namespace megdnn {
  15. namespace test {
  16. namespace conv {
  17. #if MEGDNN_WITH_BENCHMARK
  18. std::vector<BenchArgs> get_resnet50_bench_args(size_t batch) {
  19. std::vector<BenchArgs> args;
  20. args.emplace_back(BenchArgs{batch, 64, 56, 56, 256, 1, 1});
  21. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 1});
  22. args.emplace_back(BenchArgs{batch, 256, 56, 56, 32, 3, 2});
  23. args.emplace_back(BenchArgs{batch, 4, 256, 256, 32, 7, 2});
  24. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 1, 1});
  25. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 1, 1});
  26. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 1});
  27. args.emplace_back(BenchArgs{batch, 64, 56, 56, 64, 3, 2});
  28. args.emplace_back(BenchArgs{batch, 256, 56, 56, 64, 3, 2});
  29. args.emplace_back(BenchArgs{batch, 256, 56, 56, 512, 1, 2});
  30. args.emplace_back(BenchArgs{batch, 256, 56, 56, 128, 1, 2});
  31. args.emplace_back(BenchArgs{batch, 512, 28, 28, 128, 1, 1});
  32. args.emplace_back(BenchArgs{batch, 128, 28, 28, 128, 3, 1});
  33. args.emplace_back(BenchArgs{batch, 128, 28, 28, 512, 1, 1});
  34. args.emplace_back(BenchArgs{batch, 512, 28, 28, 1024, 1, 2});
  35. args.emplace_back(BenchArgs{batch, 512, 28, 28, 256, 1, 2});
  36. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 256, 1, 1});
  37. args.emplace_back(BenchArgs{batch, 256, 14, 14, 256, 3, 1});
  38. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 1});
  39. args.emplace_back(BenchArgs{batch, 256, 14, 14, 1024, 1, 2});
  40. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 2048, 1, 2});
  41. args.emplace_back(BenchArgs{batch, 1024, 14, 14, 512, 1, 2});
  42. args.emplace_back(BenchArgs{batch, 2048, 7, 7, 512, 1, 1});
  43. args.emplace_back(BenchArgs{batch, 512, 7, 7, 512, 3, 1});
  44. args.emplace_back(BenchArgs{batch, 512, 7, 7, 2048, 1, 1});
  45. return args;
  46. }
  47. std::vector<BenchArgs> get_detection_bench_args(size_t batch) {
  48. std::vector<BenchArgs> args;
  49. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 8, 3, 2});
  50. args.emplace_back(BenchArgs{batch, 32, 184, 320, 16, 3, 1});
  51. args.emplace_back(BenchArgs{batch, 16, 184, 320, 32, 3, 1});
  52. args.emplace_back(BenchArgs{batch, 8, 184, 320, 16, 3, 1});
  53. args.emplace_back(BenchArgs{batch, 8, 184, 320, 32, 3, 1});
  54. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 1});
  55. args.emplace_back(BenchArgs{batch, 32, 184, 320, 64, 3, 2});
  56. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 2});
  57. args.emplace_back(BenchArgs{batch, 32, 92, 160, 64, 3, 1});
  58. args.emplace_back(BenchArgs{batch, 64, 92, 160, 8, 3, 1});
  59. args.emplace_back(BenchArgs{batch, 64, 92, 160, 128, 3, 2});
  60. args.emplace_back(BenchArgs{batch, 128, 46, 80, 32, 3, 1});
  61. args.emplace_back(BenchArgs{batch, 128, 46, 80, 256, 3, 2});
  62. args.emplace_back(BenchArgs{batch, 128, 46, 80, 8, 3, 1});
  63. args.emplace_back(BenchArgs{batch, 64, 92, 160, 32, 3, 2});
  64. args.emplace_back(BenchArgs{batch, 32, 46, 80, 128, 3, 1});
  65. args.emplace_back(BenchArgs{batch, 8, 46, 80, 32, 3, 1});
  66. args.emplace_back(BenchArgs{batch, 64, 23, 40, 256, 3, 1});
  67. args.emplace_back(BenchArgs{batch, 256, 23, 40, 64, 3, 1});
  68. args.emplace_back(BenchArgs{batch, 128, 46, 80, 64, 3, 2});
  69. args.emplace_back(BenchArgs{batch, 256, 23, 40, 8, 3, 1});
  70. args.emplace_back(BenchArgs{batch, 8, 23, 40, 32, 3, 2});
  71. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 1});
  72. args.emplace_back(BenchArgs{batch, 8, 12, 20, 8, 3, 2});
  73. args.emplace_back(BenchArgs{batch, 8, 6, 10, 8, 3, 1});
  74. return args;
  75. }
  76. std::vector<BenchArgs> get_det_first_bench_args(size_t batch) {
  77. std::vector<BenchArgs> args;
  78. args.emplace_back(BenchArgs{batch, 4, 736, 1280, 16, 3, 2});
  79. args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
  80. args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2});
  81. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1});
  82. args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1});
  83. return args;
  84. }
  85. void benchmark_target_algo(
  86. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  87. DType filter_dtype, DType bias_dtype, DType dst_dtype, const char* algo,
  88. param::ConvBias::Format format) {
  89. megdnn_assert(src_dtype.enumv() == filter_dtype.enumv());
  90. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  91. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  92. size_t RUNS = 1000;
  93. benchmarker.set_display(false).set_times(RUNS);
  94. benchmarker_cudnn.set_display(false).set_times(RUNS);
  95. #define CUDNN_VERSION_STRING \
  96. "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL)
  97. benchmarker_cudnn.set_before_exec_callback(
  98. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  99. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_FWD_"
  100. "ALGO_IMPLICIT_PRECOMP_"
  101. "GEMM" CUDNN_VERSION_STRING));
  102. benchmarker.set_dtype(0, src_dtype)
  103. .set_dtype(1, filter_dtype)
  104. .set_dtype(2, bias_dtype)
  105. .set_dtype(3, dst_dtype)
  106. .set_dtype(4, dst_dtype);
  107. benchmarker_cudnn.set_dtype(0, src_dtype)
  108. .set_dtype(1, filter_dtype)
  109. .set_dtype(2, bias_dtype)
  110. .set_dtype(3, dst_dtype)
  111. .set_dtype(4, dst_dtype);
  112. using Param = ConvBias::Param;
  113. using Format = Param::Format;
  114. // helper function to change format
  115. auto get_tensor_shape = [](TensorShape shape, Format format) -> TensorShape {
  116. TensorShape ret;
  117. if (format == Format::NCHW4) {
  118. ret = static_cast<TensorShape>(
  119. TensorLayout{shape, dtype::Int8()}
  120. .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]})
  121. .dimshuffle({0, 1, 3, 4, 2}));
  122. } else if (format == Format::CHWN4) {
  123. ret = static_cast<TensorShape>(
  124. TensorLayout{shape, dtype::Int8()}
  125. .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]})
  126. .dimshuffle({1, 3, 4, 0, 2}));
  127. }
  128. return ret;
  129. };
  130. for (auto&& arg : args) {
  131. Param param;
  132. param.pad_h = param.pad_w = arg.f / 2;
  133. param.stride_h = param.stride_w = arg.s;
  134. param.format = format;
  135. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  136. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  137. benchmarker.set_param(param);
  138. if (!algo) {
  139. benchmarker.proxy()->target_execution_policy.algo.reset();
  140. }
  141. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  142. filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
  143. z{arg.n, arg.co, ho, wo}, dst = z;
  144. float time_in_ms = 0.f;
  145. if (algo) {
  146. time_in_ms =
  147. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
  148. benchmarker,
  149. {get_tensor_shape(src, format),
  150. get_tensor_shape(filter, format),
  151. get_tensor_shape(bias, format),
  152. {},
  153. {}},
  154. algo) /
  155. RUNS;
  156. } else {
  157. time_in_ms = benchmarker.execs(
  158. {get_tensor_shape(src, format),
  159. get_tensor_shape(filter, format),
  160. get_tensor_shape(bias, format),
  161. {},
  162. {}}) /
  163. RUNS;
  164. }
  165. Format format_cudnn = Format::NCHW4;
  166. param.format = format_cudnn;
  167. benchmarker_cudnn.set_param(param);
  168. auto time_in_ms_cudnn = benchmarker_cudnn.execs(
  169. {get_tensor_shape(src, format_cudnn),
  170. get_tensor_shape(filter, format_cudnn),
  171. get_tensor_shape(bias, format_cudnn),
  172. {},
  173. {}}) /
  174. RUNS;
  175. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f / (1e12);
  176. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  177. "time(cudnn)=%.2f %.2fTops, "
  178. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  179. src.to_string().c_str(), filter.to_string().c_str(),
  180. dst.to_string().c_str(), algo, time_in_ms, (flo / (time_in_ms * 1e-3)),
  181. time_in_ms_cudnn, (flo / (time_in_ms_cudnn * 1e-3)), algo,
  182. time_in_ms_cudnn / time_in_ms);
  183. printf("bench with z tensor\n");
  184. if (algo) {
  185. time_in_ms =
  186. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
  187. benchmarker,
  188. {get_tensor_shape(src, format),
  189. get_tensor_shape(filter, format),
  190. get_tensor_shape(bias, format),
  191. get_tensor_shape(z, format),
  192. {}},
  193. algo) /
  194. RUNS;
  195. } else {
  196. time_in_ms = benchmarker.execs(
  197. {get_tensor_shape(src, format),
  198. get_tensor_shape(filter, format),
  199. get_tensor_shape(bias, format),
  200. get_tensor_shape(z, format),
  201. {}}) /
  202. RUNS;
  203. }
  204. time_in_ms_cudnn = benchmarker_cudnn.execs(
  205. {get_tensor_shape(src, format_cudnn),
  206. get_tensor_shape(filter, format_cudnn),
  207. get_tensor_shape(bias, format_cudnn),
  208. get_tensor_shape(z, format_cudnn),
  209. {}}) /
  210. RUNS;
  211. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  212. "time(cudnn)=%.2f %.2fTops, "
  213. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  214. src.to_string().c_str(), filter.to_string().c_str(),
  215. dst.to_string().c_str(), algo, time_in_ms, (flo / (time_in_ms * 1e-3)),
  216. time_in_ms_cudnn, (flo / (time_in_ms_cudnn * 1e-3)), algo,
  217. time_in_ms_cudnn / time_in_ms);
  218. }
  219. }
  220. void benchmark_target_algo_with_cudnn_tsc(
  221. Handle* handle, const std::vector<BenchArgs>& args, DType src_dtype,
  222. DType filter_dtype, DType bias_dtype, DType dst_dtype, const char* algo,
  223. param::ConvBias::Format format, bool with_cudnn, const char* change_cudnn_algo,
  224. param::ConvBias::Format change_cudnn_format, DType change_cudnn_src_dtype,
  225. DType change_cudnn_filter_dtype, DType change_cudnn_bias_dtype,
  226. DType change_cudnn_dst_dtype) {
  227. megdnn_assert(
  228. (src_dtype.enumv() == filter_dtype.enumv()) ||
  229. (src_dtype.enumv() == DTypeEnum::Quantized4Asymm &&
  230. filter_dtype.enumv() == DTypeEnum::QuantizedS4));
  231. CUBenchmarker<ConvBiasForward> benchmarker(handle);
  232. CUBenchmarker<ConvBiasForward> benchmarker_cudnn(handle);
  233. size_t RUNS = 200;
  234. benchmarker.set_display(false).set_times(RUNS);
  235. benchmarker.set_dtype(0, src_dtype)
  236. .set_dtype(1, filter_dtype)
  237. .set_dtype(2, bias_dtype)
  238. .set_dtype(3, dst_dtype)
  239. .set_dtype(4, dst_dtype);
  240. benchmarker_cudnn.set_display(false).set_times(RUNS);
  241. std::unique_ptr<OprProxy<ConvBiasForward>> proxy{
  242. new OprProxy<ConvBiasForward>{true}};
  243. if (!algo) {
  244. benchmarker.set_proxy(proxy);
  245. }
  246. if (change_cudnn_algo) {
  247. benchmarker_cudnn.set_dtype(0, change_cudnn_src_dtype)
  248. .set_dtype(1, change_cudnn_filter_dtype)
  249. .set_dtype(2, change_cudnn_bias_dtype)
  250. .set_dtype(3, change_cudnn_dst_dtype)
  251. .set_dtype(4, change_cudnn_dst_dtype);
  252. } else {
  253. benchmarker_cudnn.set_dtype(0, src_dtype)
  254. .set_dtype(1, filter_dtype)
  255. .set_dtype(2, bias_dtype)
  256. .set_dtype(3, dst_dtype)
  257. .set_dtype(4, dst_dtype);
  258. benchmarker_cudnn.set_before_exec_callback(
  259. conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
  260. "DEFAULT:CUDNN:ConvBiasActivation:CUDNN_CONVOLUTION_"
  261. "FWD_"
  262. "ALGO_IMPLICIT_PRECOMP_GEMM" CUDNN_VERSION_STRING));
  263. }
  264. #undef CUDNN_VERSION_STRING
  265. using Param = ConvBias::Param;
  266. using Format = Param::Format;
  267. // helper function to change format
  268. auto get_tensor_shape = [](TensorShape shape, DType dtype,
  269. Format format) -> TensorShape {
  270. TensorShape ret;
  271. if (format == Format::NCHW4) {
  272. ret = static_cast<TensorShape>(
  273. TensorLayout{shape, dtype}
  274. .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]})
  275. .dimshuffle({0, 1, 3, 4, 2}));
  276. } else if (format == Format::NCHW32) {
  277. ret = static_cast<TensorShape>(
  278. TensorLayout{shape, dtype}
  279. .reshape({shape[0], shape[1] / 32, 32, shape[2], shape[3]})
  280. .dimshuffle({0, 1, 3, 4, 2}));
  281. } else if (format == Format::NCHW64) {
  282. ret = static_cast<TensorShape>(
  283. TensorLayout{shape, dtype}
  284. .reshape({shape[0], shape[1] / 64, 64, shape[2], shape[3]})
  285. .dimshuffle({0, 1, 3, 4, 2}));
  286. } else if (format == Format::CHWN4) {
  287. ret = static_cast<TensorShape>(
  288. TensorLayout{shape, dtype}
  289. .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]})
  290. .dimshuffle({1, 3, 4, 0, 2}));
  291. } else if (format == Format::NHWC) {
  292. ret = static_cast<TensorShape>(
  293. TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1}));
  294. }
  295. return ret;
  296. };
  297. for (auto&& arg : args) {
  298. Param param;
  299. param.pad_h = param.pad_w = arg.f / 2;
  300. param.stride_h = param.stride_w = arg.s;
  301. param.format = format;
  302. size_t ho = infer_conv_shape(arg.hi, arg.f, arg.s, arg.f / 2);
  303. size_t wo = infer_conv_shape(arg.wi, arg.f, arg.s, arg.f / 2);
  304. benchmarker.set_param(param);
  305. if (!algo) {
  306. benchmarker.proxy()->target_execution_policy.algo.reset();
  307. }
  308. TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
  309. filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1},
  310. z{arg.n, arg.co, ho, wo}, dst = z;
  311. // skip testcase which cannot enable nchw32 tensorcore
  312. if (format == Format::NCHW32 && (arg.co % 32 != 0 || arg.ci % 32 != 0))
  313. continue;
  314. // skip testcase which cannot enable nchw32 tensorcore
  315. if (format == Format::NCHW64 && (arg.co % 64 != 0 || arg.ci % 64 != 0))
  316. continue;
  317. // skip testcase which cannot enable nchw4/chwn4 tensorcore
  318. if ((format == Format::CHWN4 || format == Format::NCHW4) && (arg.ci % 16 != 0))
  319. continue;
  320. // skip testcase which cannot enable nhwc tensorcore
  321. if ((format == Format::NHWC) && (arg.ci % 4 != 0 || arg.co % 4 != 0))
  322. continue;
  323. Format format_cudnn =
  324. arg.ci % 32 == 0 && arg.co % 32 == 0 ? Format::NCHW32 : Format::NCHW4;
  325. if (change_cudnn_algo) {
  326. format_cudnn = change_cudnn_format;
  327. }
  328. param.format = format_cudnn;
  329. benchmarker_cudnn.set_param(param);
  330. float time_in_ms = 0.f;
  331. if (algo) {
  332. time_in_ms =
  333. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
  334. benchmarker,
  335. {get_tensor_shape(src, src_dtype, format),
  336. get_tensor_shape(filter, filter_dtype, format),
  337. get_tensor_shape(bias, bias_dtype, format),
  338. {},
  339. {}},
  340. algo) /
  341. RUNS;
  342. } else {
  343. time_in_ms = benchmarker.execs(
  344. {get_tensor_shape(src, src_dtype, format),
  345. get_tensor_shape(filter, filter_dtype, format),
  346. get_tensor_shape(bias, bias_dtype, format),
  347. {},
  348. {}}) /
  349. RUNS;
  350. }
  351. float time_in_ms_cudnn = 0;
  352. if (with_cudnn) {
  353. if (change_cudnn_algo) {
  354. time_in_ms_cudnn =
  355. algo_benchmark<
  356. ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
  357. benchmarker_cudnn,
  358. {get_tensor_shape(src, src_dtype, format_cudnn),
  359. get_tensor_shape(filter, filter_dtype, format_cudnn),
  360. get_tensor_shape(bias, bias_dtype, format_cudnn),
  361. {},
  362. {}},
  363. change_cudnn_algo) /
  364. RUNS;
  365. } else {
  366. time_in_ms_cudnn =
  367. benchmarker_cudnn.execs(
  368. {get_tensor_shape(src, src_dtype, format_cudnn),
  369. get_tensor_shape(filter, filter_dtype, format_cudnn),
  370. get_tensor_shape(bias, bias_dtype, format_cudnn),
  371. {},
  372. {}}) /
  373. RUNS;
  374. }
  375. }
  376. float flo = 2.0 * arg.n * arg.co * ho * wo * arg.ci * arg.f * arg.f / (1e12);
  377. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  378. "time(cudnn)=%.2f %.2fTops, "
  379. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  380. src.to_string().c_str(), filter.to_string().c_str(),
  381. dst.to_string().c_str(), algo, time_in_ms, (flo / (time_in_ms * 1e-3)),
  382. time_in_ms_cudnn, (flo / (time_in_ms_cudnn * 1e-3)), algo,
  383. time_in_ms_cudnn / time_in_ms);
  384. printf("bench with z tensor\n");
  385. if (algo) {
  386. time_in_ms =
  387. algo_benchmark<ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
  388. benchmarker,
  389. {get_tensor_shape(src, src_dtype, format),
  390. get_tensor_shape(filter, filter_dtype, format),
  391. get_tensor_shape(bias, bias_dtype, format),
  392. get_tensor_shape(z, src_dtype, format),
  393. {}},
  394. algo) /
  395. RUNS;
  396. } else {
  397. time_in_ms = benchmarker.execs(
  398. {get_tensor_shape(src, src_dtype, format),
  399. get_tensor_shape(filter, filter_dtype, format),
  400. get_tensor_shape(bias, bias_dtype, format),
  401. get_tensor_shape(z, src_dtype, format),
  402. {}}) /
  403. RUNS;
  404. }
  405. time_in_ms_cudnn = 0;
  406. if (with_cudnn) {
  407. if (change_cudnn_algo) {
  408. time_in_ms_cudnn =
  409. algo_benchmark<
  410. ConvBiasForward, OprProxy<ConvBiasForward>, CUTimer>(
  411. benchmarker_cudnn,
  412. {get_tensor_shape(src, src_dtype, format_cudnn),
  413. get_tensor_shape(filter, filter_dtype, format_cudnn),
  414. get_tensor_shape(bias, bias_dtype, format_cudnn),
  415. get_tensor_shape(z, src_dtype, format_cudnn),
  416. {}},
  417. change_cudnn_algo) /
  418. RUNS;
  419. } else {
  420. time_in_ms_cudnn =
  421. benchmarker_cudnn.execs(
  422. {get_tensor_shape(src, src_dtype, format_cudnn),
  423. get_tensor_shape(filter, filter_dtype, format_cudnn),
  424. get_tensor_shape(bias, bias_dtype, format_cudnn),
  425. get_tensor_shape(z, src_dtype, format_cudnn),
  426. {}}) /
  427. RUNS;
  428. }
  429. }
  430. printf("src=%s, filter=%s, dst=%s, time(algo=%s)=%.2f %.2fTops, "
  431. "time(cudnn)=%.2f %.2fTops, "
  432. "perf(algo=%s)/perf(cudnn)=%.2f\n",
  433. src.to_string().c_str(), filter.to_string().c_str(),
  434. dst.to_string().c_str(), algo, time_in_ms, (flo / (time_in_ms * 1e-3)),
  435. time_in_ms_cudnn, (flo / (time_in_ms_cudnn * 1e-3)), algo,
  436. time_in_ms_cudnn / time_in_ms);
  437. }
  438. }
  439. #endif
  440. } // namespace conv
  441. } // namespace test
  442. } // namespace megdnn
  443. #undef V1
  444. #undef V