You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

benchmark.cpp 8.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
/**
 * \file dnn/test/cuda/benchmark.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
  11. #include "test/cuda/fixture.h"
  12. #include "test/common/tensor.h"
  13. #include "test/common/timer.h"
  14. #include "megdnn/oprs.h"
  15. #include "test/common/workspace_wrapper.h"
  16. #include "test/common/benchmarker.h"
  17. #include "src/cuda/utils.h"
  18. namespace megdnn {
  19. namespace test {
  20. #if MEGDNN_WITH_BENCHMARK
  21. TEST_F(CUDA, BENCHMARK_CONVOLUTION_8X8X32)
  22. {
  23. if (!cuda::is_compute_capability_required(6, 1)) {
  24. printf("Skip CUDA.BENCHMARK_CONVOLUTION_8X8X32 test as current device"
  25. "doesn't support\n");
  26. return;
  27. }
  28. using Param = param::Convolution;
  29. auto run_1x1 = [&](size_t N, size_t OC, size_t IC, size_t H, size_t W) {
  30. Benchmarker<Convolution> benchmarker(handle_cuda());
  31. Param param_base;
  32. Param param_float = param_base, param_int = param_base;
  33. param_int.format = Param::Format::NHWC;
  34. TensorShape src_float{N, IC, H, W}, filter_float{OC, IC, 1, 1};
  35. TensorShape src_int{N, H, W, IC}, filter_int{OC, 1, 1, IC};
  36. benchmarker.set_display(false);
  37. auto time_in_ms_float = benchmarker.set_param(param_float)
  38. .set_dtype(0, dtype::Float32())
  39. .set_dtype(1, dtype::Float32())
  40. .set_dtype(2, dtype::Float32())
  41. .execs({src_float, filter_float, {}});
  42. auto time_in_ms_int = benchmarker.set_param(param_int)
  43. .set_dtype(0, dtype::Int8())
  44. .set_dtype(1, dtype::Int8())
  45. .set_dtype(2, dtype::Int32())
  46. .execs({src_int, filter_int, {}});
  47. std::cout << "1x1: N=" << N << " OC=" << OC << " IC=" << IC
  48. << " H=" << H << " W=" << W
  49. << " time_float=" << time_in_ms_float << "ms"
  50. << " time_int=" << time_in_ms_int << "ms" << std::endl;
  51. };
  52. auto run_chanwise = [&](size_t N, size_t C, size_t H, size_t W,
  53. size_t F) {
  54. size_t P = F/2;
  55. Benchmarker<Convolution> benchmarker(handle_cuda());
  56. Param param_base;
  57. param_base.pad_h = param_base.pad_w = P;
  58. param_base.sparse = Param::Sparse::GROUP;
  59. Param param_float = param_base;
  60. Param param_int = param_base;
  61. param_int.format = Param::Format::NHWC;
  62. TensorShape src_float{N, C, H, W}, filter_float{C, 1, 1, F, F};
  63. TensorShape src_int{N, H, W, C}, filter_int{C, 1, F, F, 1};
  64. benchmarker.set_display(false);
  65. auto time_in_ms_float = benchmarker.set_param(param_float)
  66. .set_dtype(0, dtype::Float32())
  67. .set_dtype(1, dtype::Float32())
  68. .set_dtype(2, dtype::Float32())
  69. .execs({src_float, filter_float, {}});
  70. auto time_in_ms_int = benchmarker.set_param(param_int)
  71. .set_dtype(0, dtype::Int8())
  72. .set_dtype(1, dtype::Int8())
  73. .set_dtype(2, dtype::Int32())
  74. .execs({src_int, filter_int, {}});
  75. std::cout << "chanwise: N=" << N << " C=" << C
  76. << " H=" << H << " W=" << W << " F=" << F
  77. << " time_float=" << time_in_ms_float << "ms"
  78. << " time_int=" << time_in_ms_int << "ms" << std::endl;
  79. };
  80. run_chanwise(1, 384, 56, 56, 3);
  81. run_1x1(1, 32, 32, 56, 56);
  82. run_1x1(1, 256, 256, 7, 7);
  83. }
  84. TEST_F(CUDA, BENCHMARK_REDUCE)
  85. {
  86. auto run = [&](size_t A, size_t B, size_t C) {
  87. Tensor<> src(handle_cuda(), TensorLayout({A, B, C}, dtype::Float32())),
  88. dst(handle_cuda(), TensorLayout({A, 1, C}, dtype::Float32()));
  89. auto opr = handle_cuda()->create_operator<Reduce>();
  90. opr->param().axis = 1;
  91. WorkspaceWrapper workspace(handle_cuda(), opr->get_workspace_in_bytes(
  92. src.layout(), dst.layout()));
  93. opr->exec(src.tensornd(), dst.tensornd(), workspace.workspace());
  94. Timer timer;
  95. megcoreSynchronize(handle_cuda()->megcore_computing_handle());
  96. timer.start();
  97. for (size_t i = 0; i < 10; ++i)
  98. opr->exec(src.tensornd(), dst.tensornd(), workspace.workspace());
  99. megcoreSynchronize(handle_cuda()->megcore_computing_handle());
  100. timer.stop();
  101. float time_in_us = timer.get_time_in_us();
  102. std::cout << "src = " << A << "x" << B << "x" << C << std::endl
  103. << "time = " << time_in_us / 1e3 << "ms" << std::endl;
  104. };
  105. run(65536, 64, 1);
  106. run(1, 268435455, 1);
  107. run(256, 1048575, 1);
  108. run(1, 1048575, 256);
  109. run(256, 4095, 256);
  110. }
  111. TEST_F(CUDA, BENCHMARK_BATCHED_MATRIX_MUL)
  112. {
  113. auto run = [&](size_t b, size_t m, size_t n, size_t k) {
  114. Tensor<> A(handle_cuda(), TensorLayout({b, m, k}, dtype::Float32()));
  115. Tensor<> B(handle_cuda(), TensorLayout({b, k, n}, dtype::Float32()));
  116. Tensor<> C(handle_cuda(), TensorLayout({b, m, n}, dtype::Float32()));
  117. auto opr = handle_cuda()->create_operator<BatchedMatrixMul>();
  118. WorkspaceWrapper workspace(handle_cuda(), opr->get_workspace_in_bytes(
  119. A.layout(), B.layout(), C.layout()));
  120. opr->exec(A.tensornd(), B.tensornd(), C.tensornd(),
  121. workspace.workspace());
  122. Timer timer;
  123. megcoreSynchronize(handle_cuda()->megcore_computing_handle());
  124. timer.start();
  125. opr->exec(A.tensornd(), B.tensornd(), C.tensornd(),
  126. workspace.workspace());
  127. megcoreSynchronize(handle_cuda()->megcore_computing_handle());
  128. timer.stop();
  129. float time_in_s = timer.get_time_in_us() / 1e6;
  130. float flo = b*m*n*k*2;
  131. float gflops = flo / time_in_s / 1e9;
  132. std::cout << "time_in_s = " << time_in_s << '\n'
  133. << "flo = " << flo << '\n'
  134. << "gflops = " << gflops << std::endl;
  135. };
  136. run(256, 256, 256, 256);
  137. }
  138. TEST_F(CUDA, BENCHMARK_MATRIX_MUL)
  139. {
  140. auto run = [&](size_t m, size_t n, size_t k) {
  141. Tensor<> A(handle_cuda(), TensorLayout({m, k}, dtype::Float32()));
  142. Tensor<> B(handle_cuda(), TensorLayout({k, n}, dtype::Float32()));
  143. Tensor<> C(handle_cuda(), TensorLayout({m, n}, dtype::Float32()));
  144. auto opr = handle_cuda()->create_operator<MatrixMul>();
  145. WorkspaceWrapper workspace(handle_cuda(), opr->get_workspace_in_bytes(
  146. A.layout(), B.layout(), C.layout()));
  147. opr->exec(A.tensornd(), B.tensornd(), C.tensornd(),
  148. workspace.workspace());
  149. Timer timer;
  150. megcoreSynchronize(handle_cuda()->megcore_computing_handle());
  151. timer.start();
  152. opr->exec(A.tensornd(), B.tensornd(), C.tensornd(),
  153. workspace.workspace());
  154. megcoreSynchronize(handle_cuda()->megcore_computing_handle());
  155. timer.stop();
  156. float time_in_s = timer.get_time_in_us() / 1e6;
  157. float flo = m*n*k*2;
  158. float gflops = flo / time_in_s / 1e9;
  159. std::cout << "time_in_s = " << time_in_s << '\n'
  160. << "flo = " << flo << '\n'
  161. << "gflops = " << gflops << std::endl;
  162. };
  163. run(4096, 4096, 4096);
  164. }
  165. TEST_F(CUDA, BENCHMARK_LOCAL)
  166. {
  167. auto run = [&](size_t N, size_t IC, size_t IH, size_t IW,
  168. size_t OC, size_t OH, size_t OW, size_t FH, size_t FW) {
  169. Tensor<> src(handle_cuda(), TensorLayout({N, IC, IH, IW},
  170. dtype::Float32()));
  171. Tensor<> filter(handle_cuda(), TensorLayout({OH, OW, IC, FH, FW, OC},
  172. dtype::Float32()));
  173. Tensor<> dst(handle_cuda(), TensorLayout({N, OC, OH, OW},
  174. dtype::Float32()));
  175. auto opr = handle_cuda()->create_operator<Local>();
  176. WorkspaceWrapper workspace(handle_cuda(), opr->get_workspace_in_bytes(
  177. src.layout(), filter.layout(), dst.layout()));
  178. opr->exec(src.tensornd(), filter.tensornd(), dst.tensornd(),
  179. workspace.workspace());
  180. Timer timer;
  181. megcoreSynchronize(handle_cuda()->megcore_computing_handle());
  182. timer.start();
  183. opr->exec(src.tensornd(), filter.tensornd(), dst.tensornd(),
  184. workspace.workspace());
  185. megcoreSynchronize(handle_cuda()->megcore_computing_handle());
  186. timer.stop();
  187. float time_in_us = timer.get_time_in_us();
  188. std::cout << "time = " << time_in_us << "us" << std::endl;
  189. };
  190. run(32, 64, 7, 7, 64, 5, 5, 3, 3);
  191. }
  192. #endif
  193. } // namespace test
  194. } // namespace megdnn
  195. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台