
elemwise_benchmark.cpp

/**
 * \file dnn/test/arm_common/elemwise_benchmark.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#if MEGDNN_WITH_BENCHMARK
#include "test/arm_common/fixture.h"

#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"

using namespace megdnn;
using namespace test;

#define TEST_IN_DIFF_DISTRIBUTION(proportion_of_inf, dataset_number) \
    max_val = 88.3762626647949f / (1 - proportion_of_inf);           \
    UniformFloatRNG rng##dataset_number(0.f, max_val);               \
    B.set_rng(0, &rng##dataset_number);                              \
    B.execs({{355600}, {}});
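// Note on the constant: 88.3762626647949f matches the MAXLOGF overflow
// cutoff used by Cephes-style expf implementations (roughly ln(2^127.5));
// inputs above it are treated as exp overflow. A uniform sample from
// [0, T / (1 - p)] exceeds a threshold T with probability exactly p, so
// proportion_of_inf controls what fraction of the inputs hits the
// overflow/inf path. (The Cephes connection is an assumption; the scaling
// arithmetic follows directly from the uniform distribution.)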

TEST_F(ARM_COMMON, BENCHMARK_ELEM_UNARY_FLOATONLY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    // UniformFloatWithZeroRNG rng(80, 100, 0.1);
    printf("Test Optr exp(x)\n");
    B.set_param(Mode::EXP);
    B.execs({{355600}, {}});
    B.set_param(Mode::EXP);
    B.execs({{355600}, {}});
    float max_val = 0;
    TEST_IN_DIFF_DISTRIBUTION(0.25, 1)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 2)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 3)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 4)

    printf("Test Optr tanh(x)\n");
    B.set_param(Mode::TANH);
    B.execs({{355600}, {}});
    B.set_param(Mode::TANH);
    B.execs({{355600}, {}});
    max_val = 0;
    TEST_IN_DIFF_DISTRIBUTION(0.25, 5)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 6)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 7)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 8)

    printf("Test Optr fast_tanh(x)\n");
    B.set_param(Mode::FAST_TANH);
    B.execs({{355600}, {}});

    printf("Test Optr sigmoid(x)\n");
    B.set_param(Mode::SIGMOID);
    B.execs({{355600}, {}});
    TEST_IN_DIFF_DISTRIBUTION(0.25, 9)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 10)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 11)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 12)

    B.set_param(Mode::SIGMOID);
    B.execs({{355600}, {}});
    max_val = 0;
    TEST_IN_DIFF_DISTRIBUTION(0.25, 13)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 14)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 15)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 16)
}

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);
    auto run_unary = [&](const TensorShape& shape, param::Elemwise::Mode mode,
                         const char* mode_str, DType dtype) {
        B.set_param(mode).set_dtype(0, dtype);
        float time = B.execs({shape, {}}) / RUN_TIMES;
        float computations =
                shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
        printf("%s(%s):\tlayout(%s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               dtype.name(), shape.to_string().c_str(), time,
               computations * dtype.size() / time * 1e3);
    };

#define RUN(shape, mode, dtype) run_unary(shape, mode, #mode, dtype);
#define BENCHMARK_CASES_INT(shape, dtype) \
    RUN(shape, Mode::RELU, dtype)         \
    RUN(shape, Mode::ABS, dtype)
#define BENCHMARK_CASES_FLOAT(shape, dtype) \
    BENCHMARK_CASES_INT(shape, dtype)       \
    RUN(shape, Mode::SIGMOID, dtype)        \
    RUN(shape, Mode::EXP, dtype)            \
    RUN(shape, Mode::TANH, dtype)           \
    RUN(shape, Mode::FAST_TANH, dtype)

    TensorShape shape = {10, 50, 10, 100};
    BENCHMARK_CASES_INT(shape, dtype::Int32());
    BENCHMARK_CASES_INT(shape, dtype::Int16());
    BENCHMARK_CASES_INT(shape, dtype::Int8());
    BENCHMARK_CASES_FLOAT(shape, dtype::Float32());
#undef BENCHMARK_CASES_INT
#undef BENCHMARK_CASES_FLOAT
#undef RUN
}
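// Worked example for the figure printed above (one read plus one write per
// element is the assumption baked into `computations`): {10, 50, 10, 100}
// holds 500000 elements, so a Float32 run moves 500000 * 2 * 4 bytes, about
// 0.00373 GiB; with time = 1 ms the reported bandwidth would be
// 0.00373 / 1 * 1e3, roughly 3.73 GBps.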

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY_MULTI_TYPE) {
    Benchmarker<ElemwiseMultiType> B(handle());
    using Mode = ElemwiseMultiType::Param::Mode;
    const size_t RUN_TIMES = 20;
    B.set_times(RUN_TIMES).set_display(false);
    auto run_unary = [&](const TensorShape& shape, Mode mode,
                         const char* mode_str, DType src_dtype,
                         DType dst_dtype) {
        B.set_param(mode).set_dtype(0, src_dtype).set_dtype(1, dst_dtype);
        float time = B.execs({shape, {}}) / RUN_TIMES;
        float computations =
                shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
        printf("type %s %s(%s) to %s \ttime(%fms)\tbandwidth(%fGBps)\n",
               mode_str, src_dtype.name(), shape.to_string().c_str(),
               dst_dtype.name(), time,
               computations * src_dtype.size() / time * 1e3);
    };

#define RUN(shape, mode, src_dtype, dst_dtype) \
    run_unary(shape, mode, #mode, src_dtype, dst_dtype);
#define BENCHMARK_CASES_INT(shape, src_dtype, dst_dtype) \
    RUN(shape, Mode::QRELU, src_dtype, dst_dtype)        \
    RUN(shape, Mode::QABS, src_dtype, dst_dtype)

    TensorShape shape = {10, 50, 10, 100};
    BENCHMARK_CASES_INT(shape, dtype::QuantizedS32(62.5f),
                        dtype::QuantizedS8(2.5f));
#undef BENCHMARK_CASES_INT
#undef BENCHMARK_CASES_FLOAT
#undef RUN
}
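// QRELU/QABS fuse the elementwise op with a requantization from
// QuantizedS32 (scale 62.5) to QuantizedS8 (scale 2.5). Under the usual
// scale-quantization convention (an assumption; this file does not spell it
// out): real = q_src * 62.5 and q_dst = saturate(round(max(real, 0) / 2.5)),
// i.e. each value is rescaled by 62.5 / 2.5 = 25 and clamped to int8 range.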

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_BINARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);
    auto run_binary = [&](const TensorShape& shape0, const TensorShape& shape1,
                          param::Elemwise::Mode mode, const char* mode_str,
                          DType dtype) {
        B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype);
        float time = B.execs({shape0, shape1, {}}) / RUN_TIMES;
        float bandwidth =
                (shape0.total_nr_elems() + shape1.total_nr_elems() +
                 std::max(shape0.total_nr_elems(), shape1.total_nr_elems())) /
                (1024.f * 1024.f * 1024.f) * dtype.size() / time * 1e3;
        printf("%s(%s):\tlayout(%s %s)\ttime(%fms)\tbandwidth(%fGBps)\n",
               mode_str, dtype.name(), shape0.to_string().c_str(),
               shape1.to_string().c_str(), time, bandwidth);
    };

#define RUN(shape0, shape1, mode, dtype) \
    run_binary(shape0, shape1, mode, #mode, dtype);
#define BENCHMARK_CASES_INT(shape0, shape1, dtype) \
    RUN(shape0, shape1, Mode::ADD, dtype)          \
    RUN(shape0, shape1, Mode::MIN, dtype)          \
    RUN(shape0, shape1, Mode::MAX, dtype)          \
    RUN(shape0, shape1, Mode::SUB, dtype)          \
    RUN(shape0, shape1, Mode::MUL, dtype)          \
    RUN(shape0, shape1, Mode::FUSE_ADD_RELU, dtype)
#define BENCHMARK_CASES_FLOAT(shape0, shape1, dtype)   \
    BENCHMARK_CASES_INT(shape0, shape1, dtype)         \
    RUN(shape0, shape1, Mode::TRUE_DIV, dtype)         \
    RUN(shape0, shape1, Mode::FUSE_ADD_SIGMOID, dtype) \
    RUN(shape0, shape1, Mode::FUSE_ADD_TANH, dtype)
#define BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1)      \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int32()); \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int16()); \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int8());  \
    BENCHMARK_CASES_FLOAT(shape0, shape1, dtype::Float32());

    TensorShape shape0 = {10, 50, 10, 100};
    TensorShape shape1 = {10, 50, 10, 100};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);

    shape1 = {1, 50, 1, 1};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);

    shape1 = {1, 1, 1, 1};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);
#undef BENCHMARK_CASES_EVERY_DTYPE
#undef BENCHMARK_CASES_FLOAT
#undef BENCHMARK_CASES_INT
#undef RUN
}
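// The three shape1 settings cover the main broadcast patterns: the full
// {10, 50, 10, 100} tensor (no broadcast), {1, 50, 1, 1} (per-channel
// broadcast in NCHW layout), and {1, 1, 1, 1} (scalar broadcast). The
// bandwidth model counts one read of each input plus one write of the
// broadcast-shaped output, hence elems(shape0) + elems(shape1) + the larger
// of the two.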

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_TERNARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);
    auto run_ternary = [&](const TensorShape& shape0, const TensorShape& shape1,
                           const TensorShape& shape2,
                           param::Elemwise::Mode mode, const char* mode_str,
                           DType dtype) {
        B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(
                2, dtype);
        float time = B.execs({shape0, shape1, shape2, {}}) / RUN_TIMES;
        float bandwidth = (shape0.total_nr_elems() * 2 +
                           shape1.total_nr_elems() + shape2.total_nr_elems()) /
                          (1024.f * 1024.f * 1024.f) * dtype.size() / time *
                          1e3;
        printf("%s(%s):\tlayout(%s %s %s)\ttime(%fms)\tbandwidth(%fGBps)\n",
               mode_str, dtype.name(), shape0.to_string().c_str(),
               shape1.to_string().c_str(), shape2.to_string().c_str(), time,
               bandwidth);
    };

    TensorShape shape = {10, 50, 10, 100};
    run_ternary(shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
                dtype::Int32());
    run_ternary(shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
                dtype::Int16());
    run_ternary(shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
                dtype::Int8());
    run_ternary(shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
                dtype::Float32());
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    run_ternary(shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
                dtype::Float32());
    run_ternary(shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
                dtype::Float16());
    run_ternary({1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
                dtype::Float32());
    run_ternary({1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
                dtype::Float16());
#endif  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
}
#endif  // MEGDNN_WITH_BENCHMARK
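These cases are compiled only when MEGDNN_WITH_BENCHMARK is defined. Each one is an ordinary GoogleTest TEST_F on the ARM_COMMON fixture, so a single benchmark can be selected with gtest's standard filter flag, e.g. --gtest_filter=ARM_COMMON.BENCHMARK_ELEMWISE_BINARY (assuming the usual MegDNN test runner binary).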

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU and GPU build to choose between. To run GPU programs, make sure the machine has a GPU device and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.