
// benchmark.cpp

#include "test/cuda/fixture.h"

#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/tensor.h"
#include "test/common/timer.h"
#include "test/common/workspace_wrapper.h"
#include "test/cuda/utils.h"

namespace megdnn {
namespace test {

#if MEGDNN_WITH_BENCHMARK
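// Compare Float32 convolution in NCHW layout against Int8 (Int32 accumulation)
// convolution in NHWC layout, for 1x1 and channel-wise (group) filters.
// The test requires compute capability 6.1.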
TEST_F(CUDA, BENCHMARK_CONVOLUTION_8X8X32) {
    require_compute_capability(6, 1);
    using Param = param::Convolution;
    auto run_1x1 = [&](size_t N, size_t OC, size_t IC, size_t H, size_t W) {
        Benchmarker<Convolution> benchmarker(handle_cuda());
        Param param_base;
        Param param_float = param_base, param_int = param_base;
        param_int.format = Param::Format::NHWC;
        TensorShape src_float{N, IC, H, W}, filter_float{OC, IC, 1, 1};
        TensorShape src_int{N, H, W, IC}, filter_int{OC, 1, 1, IC};
        benchmarker.set_display(false);
        auto time_in_ms_float = benchmarker.set_param(param_float)
                                        .set_dtype(0, dtype::Float32())
                                        .set_dtype(1, dtype::Float32())
                                        .set_dtype(2, dtype::Float32())
                                        .execs({src_float, filter_float, {}});
        auto time_in_ms_int = benchmarker.set_param(param_int)
                                      .set_dtype(0, dtype::Int8())
                                      .set_dtype(1, dtype::Int8())
                                      .set_dtype(2, dtype::Int32())
                                      .execs({src_int, filter_int, {}});
        std::cout << "1x1: N=" << N << " OC=" << OC << " IC=" << IC << " H=" << H
                  << " W=" << W << " time_float=" << time_in_ms_float << "ms"
                  << " time_int=" << time_in_ms_int << "ms" << std::endl;
    };
    auto run_chanwise = [&](size_t N, size_t C, size_t H, size_t W, size_t F) {
        size_t P = F / 2;
        Benchmarker<Convolution> benchmarker(handle_cuda());
        Param param_base;
        param_base.pad_h = param_base.pad_w = P;
        param_base.sparse = Param::Sparse::GROUP;
        Param param_float = param_base;
        Param param_int = param_base;
        param_int.format = Param::Format::NHWC;
        TensorShape src_float{N, C, H, W}, filter_float{C, 1, 1, F, F};
        TensorShape src_int{N, H, W, C}, filter_int{C, 1, F, F, 1};
        benchmarker.set_display(false);
        auto time_in_ms_float = benchmarker.set_param(param_float)
                                        .set_dtype(0, dtype::Float32())
                                        .set_dtype(1, dtype::Float32())
                                        .set_dtype(2, dtype::Float32())
                                        .execs({src_float, filter_float, {}});
        auto time_in_ms_int = benchmarker.set_param(param_int)
                                      .set_dtype(0, dtype::Int8())
                                      .set_dtype(1, dtype::Int8())
                                      .set_dtype(2, dtype::Int32())
                                      .execs({src_int, filter_int, {}});
        std::cout << "chanwise: N=" << N << " C=" << C << " H=" << H << " W=" << W
                  << " F=" << F << " time_float=" << time_in_ms_float << "ms"
                  << " time_int=" << time_in_ms_int << "ms" << std::endl;
    };
    run_chanwise(1, 384, 56, 56, 3);
    run_1x1(1, 32, 32, 56, 56);
    run_1x1(1, 256, 256, 7, 7);
}
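// Benchmark Reduce along axis 1 for several {A, B, C} shapes. The reported time
// is the total for 10 back-to-back executions, taken after one warm-up run.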
TEST_F(CUDA, BENCHMARK_REDUCE) {
    auto run = [&](size_t A, size_t B, size_t C) {
        Tensor<> src(handle_cuda(), TensorLayout({A, B, C}, dtype::Float32())),
                dst(handle_cuda(), TensorLayout({A, 1, C}, dtype::Float32()));
        auto opr = handle_cuda()->create_operator<Reduce>();
        opr->param().axis = 1;
        WorkspaceWrapper workspace(
                handle_cuda(), opr->get_workspace_in_bytes(src.layout(), dst.layout()));
        opr->exec(src.tensornd(), dst.tensornd(), workspace.workspace());
        Timer timer;
        megcoreSynchronize(handle_cuda()->megcore_computing_handle());
        timer.start();
        for (size_t i = 0; i < 10; ++i)
            opr->exec(src.tensornd(), dst.tensornd(), workspace.workspace());
        megcoreSynchronize(handle_cuda()->megcore_computing_handle());
        timer.stop();
        float time_in_us = timer.get_time_in_us();
        std::cout << "src = " << A << "x" << B << "x" << C << std::endl
                  << "time = " << time_in_us / 1e3 << "ms" << std::endl;
    };
    run(65536, 64, 1);
    run(1, 268435455, 1);
    run(256, 1048575, 1);
    run(1, 1048575, 256);
    run(256, 4095, 256);
}
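// Benchmark BatchedMatrixMul on Float32 tensors: one timed execution after a
// warm-up run, reporting GFLOPS computed from 2 * b * m * n * k operations.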
TEST_F(CUDA, BENCHMARK_BATCHED_MATRIX_MUL) {
    auto run = [&](size_t b, size_t m, size_t n, size_t k) {
        Tensor<> A(handle_cuda(), TensorLayout({b, m, k}, dtype::Float32()));
        Tensor<> B(handle_cuda(), TensorLayout({b, k, n}, dtype::Float32()));
        Tensor<> C(handle_cuda(), TensorLayout({b, m, n}, dtype::Float32()));
        auto opr = handle_cuda()->create_operator<BatchedMatrixMul>();
        WorkspaceWrapper workspace(
                handle_cuda(),
                opr->get_workspace_in_bytes(A.layout(), B.layout(), C.layout()));
        opr->exec(A.tensornd(), B.tensornd(), C.tensornd(), workspace.workspace());
        Timer timer;
        megcoreSynchronize(handle_cuda()->megcore_computing_handle());
        timer.start();
        opr->exec(A.tensornd(), B.tensornd(), C.tensornd(), workspace.workspace());
        megcoreSynchronize(handle_cuda()->megcore_computing_handle());
        timer.stop();
        float time_in_s = timer.get_time_in_us() / 1e6;
        float flo = b * m * n * k * 2;
        float gflops = flo / time_in_s / 1e9;
        std::cout << "time_in_s = " << time_in_s << '\n'
                  << "flo = " << flo << '\n'
                  << "gflops = " << gflops << std::endl;
    };
    run(256, 256, 256, 256);
}
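// Benchmark MatrixMul on Float32 matrices: one timed execution after a warm-up
// run, reporting GFLOPS computed from 2 * m * n * k operations.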
TEST_F(CUDA, BENCHMARK_MATRIX_MUL) {
    auto run = [&](size_t m, size_t n, size_t k) {
        Tensor<> A(handle_cuda(), TensorLayout({m, k}, dtype::Float32()));
        Tensor<> B(handle_cuda(), TensorLayout({k, n}, dtype::Float32()));
        Tensor<> C(handle_cuda(), TensorLayout({m, n}, dtype::Float32()));
        auto opr = handle_cuda()->create_operator<MatrixMul>();
        WorkspaceWrapper workspace(
                handle_cuda(),
                opr->get_workspace_in_bytes(A.layout(), B.layout(), C.layout()));
        opr->exec(A.tensornd(), B.tensornd(), C.tensornd(), workspace.workspace());
        Timer timer;
        megcoreSynchronize(handle_cuda()->megcore_computing_handle());
        timer.start();
        opr->exec(A.tensornd(), B.tensornd(), C.tensornd(), workspace.workspace());
        megcoreSynchronize(handle_cuda()->megcore_computing_handle());
        timer.stop();
        float time_in_s = timer.get_time_in_us() / 1e6;
        float flo = m * n * k * 2;
        float gflops = flo / time_in_s / 1e9;
        std::cout << "time_in_s = " << time_in_s << '\n'
                  << "flo = " << flo << '\n'
                  << "gflops = " << gflops << std::endl;
    };
    run(4096, 4096, 4096);
}
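// Benchmark the Local (locally connected) operator with a per-output-position
// filter of layout {OH, OW, IC, FH, FW, OC}; reports the raw execution time in us.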
TEST_F(CUDA, BENCHMARK_LOCAL) {
    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t OC, size_t OH,
                   size_t OW, size_t FH, size_t FW) {
        Tensor<> src(handle_cuda(), TensorLayout({N, IC, IH, IW}, dtype::Float32()));
        Tensor<> filter(
                handle_cuda(),
                TensorLayout({OH, OW, IC, FH, FW, OC}, dtype::Float32()));
        Tensor<> dst(handle_cuda(), TensorLayout({N, OC, OH, OW}, dtype::Float32()));
        auto opr = handle_cuda()->create_operator<Local>();
        WorkspaceWrapper workspace(
                handle_cuda(), opr->get_workspace_in_bytes(
                                       src.layout(), filter.layout(), dst.layout()));
        opr->exec(
                src.tensornd(), filter.tensornd(), dst.tensornd(),
                workspace.workspace());
        Timer timer;
        megcoreSynchronize(handle_cuda()->megcore_computing_handle());
        timer.start();
        opr->exec(
                src.tensornd(), filter.tensornd(), dst.tensornd(),
                workspace.workspace());
        megcoreSynchronize(handle_cuda()->megcore_computing_handle());
        timer.stop();
        float time_in_us = timer.get_time_in_us();
        std::cout << "time = " << time_in_us << "us" << std::endl;
    };
    run(32, 64, 7, 7, 64, 5, 5, 3, 3);
}
#endif

} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen