
elemwise_benchmark.cpp 9.7 kB

/**
 * \file dnn/test/arm_common/elemwise_benchmark.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#if MEGDNN_WITH_BENCHMARK
#include "test/arm_common/fixture.h"

#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"

using namespace megdnn;
using namespace test;
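// Run the benchmark on inputs drawn from a uniform distribution whose upper
// bound is scaled so that roughly `proportion_of_inf` of the samples exceed
// 88.3762626647949f (about 127.5 * ln 2), apparently used here as the float32
// exp overflow guard, so that fraction of exp(x) results saturates to inf.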
#define TEST_IN_DIFF_DISTRUBUTION(proportion_of_inf, dataset_number) \
    max_val = 88.3762626647949f / (1 - proportion_of_inf);           \
    UniformFloatRNG rng##dataset_number(0.f, max_val);                \
    B.set_rng(0, &rng##dataset_number);                               \
    B.execs({{355600}, {}});
TEST_F(ARM_COMMON, BENCHMARK_ELEM_UNARY_FLOATONLY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    // UniformFloatWithZeroRNG rng(80, 100, 0.1);
    printf("Test Optr exp(x)\n");
    B.set_param(Mode::EXP);
    B.execs({{355600}, {}});
    B.set_param(Mode::EXP);
    B.execs({{355600}, {}});
    float max_val = 0;
    TEST_IN_DIFF_DISTRUBUTION(0.25, 1)
    TEST_IN_DIFF_DISTRUBUTION(0.5, 2)
    TEST_IN_DIFF_DISTRUBUTION(0.75, 3)
    TEST_IN_DIFF_DISTRUBUTION(0.9999, 4)

    printf("Test Optr tanh(x)\n");
    B.set_param(Mode::TANH);
    B.execs({{355600}, {}});
    B.set_param(Mode::TANH);
    B.execs({{355600}, {}});
    max_val = 0;
    TEST_IN_DIFF_DISTRUBUTION(0.25, 5)
    TEST_IN_DIFF_DISTRUBUTION(0.5, 6)
    TEST_IN_DIFF_DISTRUBUTION(0.75, 7)
    TEST_IN_DIFF_DISTRUBUTION(0.9999, 8)

    printf("Test Optr fast_tanh(x)\n");
    B.set_param(Mode::FAST_TANH);
    B.execs({{355600}, {}});

    printf("Test Optr sigmoid(x)\n");
    B.set_param(Mode::SIGMOID);
    B.execs({{355600}, {}});
    TEST_IN_DIFF_DISTRUBUTION(0.25, 9)
    TEST_IN_DIFF_DISTRUBUTION(0.5, 10)
    TEST_IN_DIFF_DISTRUBUTION(0.75, 11)
    TEST_IN_DIFF_DISTRUBUTION(0.9999, 12)
    B.set_param(Mode::SIGMOID);
    B.execs({{355600}, {}});
    max_val = 0;
    TEST_IN_DIFF_DISTRUBUTION(0.25, 13)
    TEST_IN_DIFF_DISTRUBUTION(0.5, 14)
    TEST_IN_DIFF_DISTRUBUTION(0.75, 15)
    TEST_IN_DIFF_DISTRUBUTION(0.9999, 16)
}
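// Time unary elemwise modes across int and float dtypes and print an effective
// memory bandwidth; the figure assumes two memory accesses per element
// (one read, one write), hence the factor of 2 in `computations`.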
TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);
    auto run_unary = [&](const TensorShape& shape, param::Elemwise::Mode mode,
                         const char* mode_str, DType dtype) {
        B.set_param(mode).set_dtype(0, dtype);
        float time = B.execs({shape, {}}) / RUN_TIMES;
        float computations = shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
        printf("%s(%s):\tlayout(%s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               dtype.name(), shape.to_string().c_str(), time,
               computations * dtype.size() / time * 1e3);
    };

#define RUN(shape, mode, dtype) run_unary(shape, mode, #mode, dtype);
#define BENCHMARK_CASES_INT(shape, dtype) \
    RUN(shape, Mode::RELU, dtype)         \
    RUN(shape, Mode::ABS, dtype)
#define BENCHMARK_CASES_FLOAT(shape, dtype) \
    BENCHMARK_CASES_INT(shape, dtype)       \
    RUN(shape, Mode::SIGMOID, dtype)        \
    RUN(shape, Mode::EXP, dtype)            \
    RUN(shape, Mode::TANH, dtype)           \
    RUN(shape, Mode::FAST_TANH, dtype)

    TensorShape shape = {10, 50, 10, 100};
    BENCHMARK_CASES_INT(shape, dtype::Int32());
    BENCHMARK_CASES_INT(shape, dtype::Int16());
    BENCHMARK_CASES_INT(shape, dtype::Int8());
    BENCHMARK_CASES_FLOAT(shape, dtype::Float32());
#undef BENCHMARK_CASES_INT
#undef BENCHMARK_CASES_FLOAT
#undef RUN
}
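// Quantized unary variants (QRELU/QABS) go through ElemwiseMultiType, where
// source and destination dtypes differ, so the output dtype is configured
// separately on tensor index 1.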
TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY_MULTI_TYPE) {
    Benchmarker<ElemwiseMultiType> B(handle());
    using Mode = ElemwiseMultiType::Param::Mode;
    const size_t RUN_TIMES = 20;
    B.set_times(RUN_TIMES).set_display(false);
    auto run_unary = [&](const TensorShape& shape, Mode mode, const char* mode_str,
                         DType src_dtype, DType dst_dtype) {
        B.set_param(mode).set_dtype(0, src_dtype).set_dtype(1, dst_dtype);
        float time = B.execs({shape, {}}) / RUN_TIMES;
        float computations = shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
        printf("type %s %s(%s) to %s \ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               src_dtype.name(), shape.to_string().c_str(), dst_dtype.name(), time,
               computations * src_dtype.size() / time * 1e3);
    };

#define RUN(shape, mode, src_dtype, dst_dtye) \
    run_unary(shape, mode, #mode, src_dtype, dst_dtye);
#define BENCHMARK_CASES_INT(shape, src_dtype, dst_dtye) \
    RUN(shape, Mode::QRELU, src_dtype, dst_dtye)        \
    RUN(shape, Mode::QABS, src_dtype, dst_dtye)

    TensorShape shape = {10, 50, 10, 100};
    BENCHMARK_CASES_INT(shape, dtype::QuantizedS32(62.5f), dtype::QuantizedS8(2.5f));
#undef BENCHMARK_CASES_INT
#undef BENCHMARK_CASES_FLOAT
#undef RUN
}
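// Binary benchmark: reports time and effective bandwidth for each mode and
// dtype, covering equal-shape operands as well as channel-broadcast
// ({1, 50, 1, 1}) and scalar ({1, 1, 1, 1}) second operands.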
TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_BINARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);
    auto run_binary = [&](const TensorShape& shape0, const TensorShape& shape1,
                          param::Elemwise::Mode mode, const char* mode_str,
                          DType dtype) {
        B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype);
        float time = B.execs({shape0, shape1, {}}) / RUN_TIMES;
        float bandwidth = (shape0.total_nr_elems() + shape1.total_nr_elems() +
                           std::max(shape0.total_nr_elems(), shape1.total_nr_elems())) /
                          (1024.f * 1024.f * 1024.f) * dtype.size() / time * 1e3;
        printf("%s(%s):\tlayout(%s %s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               dtype.name(), shape0.to_string().c_str(), shape1.to_string().c_str(),
               time, bandwidth);
    };

#define RUN(shape0, shape1, mode, dtype) run_binary(shape0, shape1, mode, #mode, dtype);
#define BENCHMARK_CASES_INT(shape0, shape1, dtype) \
    RUN(shape0, shape1, Mode::ADD, dtype)          \
    RUN(shape0, shape1, Mode::MIN, dtype)          \
    RUN(shape0, shape1, Mode::MAX, dtype)          \
    RUN(shape0, shape1, Mode::SUB, dtype)          \
    RUN(shape0, shape1, Mode::MUL, dtype)          \
    RUN(shape0, shape1, Mode::FUSE_ADD_RELU, dtype)
#define BENCHMARK_CASES_FLOAT(shape0, shape1, dtype)   \
    BENCHMARK_CASES_INT(shape0, shape1, dtype)         \
    RUN(shape0, shape1, Mode::TRUE_DIV, dtype)         \
    RUN(shape0, shape1, Mode::FUSE_ADD_SIGMOID, dtype) \
    RUN(shape0, shape1, Mode::FUSE_ADD_TANH, dtype)
#define BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1)      \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int32()); \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int16()); \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int8());  \
    BENCHMARK_CASES_FLOAT(shape0, shape1, dtype::Float32());

    TensorShape shape0 = {10, 50, 10, 100};
    TensorShape shape1 = {10, 50, 10, 100};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);

    shape1 = {1, 50, 1, 1};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);

    shape1 = {1, 1, 1, 1};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);
#undef BENCHMARK_CASES_EVERY_DTYPE
#undef BENCHMARK_CASES_FLOAT
#undef BENCHMARK_CASES_INT
#undef RUN
}
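// Ternary benchmark for FUSE_MUL_ADD3 (fused multiply-add over three inputs);
// the fp16 broadcast cases are compiled only when the target provides
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC.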
TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_TERNARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);
    auto run_ternary = [&](const TensorShape& shape0, const TensorShape& shape1,
                           const TensorShape& shape2, param::Elemwise::Mode mode,
                           const char* mode_str, DType dtype) {
        B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        float time = B.execs({shape0, shape1, shape2, {}}) / RUN_TIMES;
        float bandwidth = (shape0.total_nr_elems() * 2 + shape1.total_nr_elems() +
                           shape2.total_nr_elems()) /
                          (1024.f * 1024.f * 1024.f) * dtype.size() / time * 1e3;
        printf("%s(%s):\tlayout(%s %s %s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               dtype.name(), shape0.to_string().c_str(), shape1.to_string().c_str(),
               shape2.to_string().c_str(), time, bandwidth);
    };

    TensorShape shape = {10, 50, 10, 100};
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Int32());
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Int16());
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Int8());
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Float32());
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    run_ternary(
            shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float32());
    run_ternary(
            shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float16());
    run_ternary(
            {1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float32());
    run_ternary(
            {1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float16());
#endif
}
#endif
