
elemwise_benchmark.cpp

#if MEGDNN_WITH_BENCHMARK
#include "test/arm_common/fixture.h"

#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"

using namespace megdnn;
using namespace test;
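// TEST_IN_DIFF_DISTRIBUTION draws inputs uniformly from [0, max_val], where
// max_val is chosen so that `proportion_of_inf` of the samples exceed
// 88.3762626647949f (~= 127.5 * ln(2), the overflow threshold used by
// Cephes-style vectorized expf): threshold / max_val = 1 - proportion_of_inf.
// This stresses how each kernel handles inputs that overflow to +inf or saturate.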
#define TEST_IN_DIFF_DISTRIBUTION(proportion_of_inf, dataset_number) \
    max_val = 88.3762626647949f / (1 - proportion_of_inf);           \
    UniformFloatRNG rng##dataset_number(0.f, max_val);               \
    B.set_rng(0, &rng##dataset_number);                              \
    B.execs({{355600}, {}});
TEST_F(ARM_COMMON, BENCHMARK_ELEM_UNARY_FLOATONLY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    // UniformFloatWithZeroRNG rng(80, 100, 0.1);
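    // For each mode: run twice on the default distribution (the repeated
    // set_param/execs pair presumably acts as a warm-up plus baseline), then
    // sweep distributions where 25%/50%/75%/~100% of the inputs overflow expf.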
    printf("Test Optr exp(x)\n");
    B.set_param(Mode::EXP);
    B.execs({{355600}, {}});
    B.set_param(Mode::EXP);
    B.execs({{355600}, {}});
    float max_val = 0;
    TEST_IN_DIFF_DISTRIBUTION(0.25, 1)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 2)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 3)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 4)

    printf("Test Optr tanh(x)\n");
    B.set_param(Mode::TANH);
    B.execs({{355600}, {}});
    B.set_param(Mode::TANH);
    B.execs({{355600}, {}});
    max_val = 0;
    TEST_IN_DIFF_DISTRIBUTION(0.25, 5)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 6)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 7)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 8)

    printf("Test Optr fast_tanh(x)\n");
    B.set_param(Mode::FAST_TANH);
    B.execs({{355600}, {}});

    printf("Test Optr sigmoid(x)\n");
    B.set_param(Mode::SIGMOID);
    B.execs({{355600}, {}});
    TEST_IN_DIFF_DISTRIBUTION(0.25, 9)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 10)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 11)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 12)
    B.set_param(Mode::SIGMOID);
    B.execs({{355600}, {}});
    max_val = 0;
    TEST_IN_DIFF_DISTRIBUTION(0.25, 13)
    TEST_IN_DIFF_DISTRIBUTION(0.5, 14)
    TEST_IN_DIFF_DISTRIBUTION(0.75, 15)
    TEST_IN_DIFF_DISTRIBUTION(0.9999, 16)
}

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);
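    // run_unary reports effective memory bandwidth: each element is read once
    // and written once (hence the factor of 2), sizes are in GiB, and time is
    // in ms, so the *1e3 converts the ratio to GB/s.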
    auto run_unary = [&](const TensorShape& shape, param::Elemwise::Mode mode,
                         const char* mode_str, DType dtype) {
        B.set_param(mode).set_dtype(0, dtype);
        float time = B.execs({shape, {}}) / RUN_TIMES;
        float computations = shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
        printf("%s(%s):\tlayout(%s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               dtype.name(), shape.to_string().c_str(), time,
               computations * dtype.size() / time * 1e3);
    };
#define RUN(shape, mode, dtype) run_unary(shape, mode, #mode, dtype);
#define BENCHMARK_CASES_INT(shape, dtype) \
    RUN(shape, Mode::RELU, dtype)         \
    RUN(shape, Mode::ABS, dtype)
#define BENCHMARK_CASES_FLOAT(shape, dtype) \
    BENCHMARK_CASES_INT(shape, dtype)       \
    RUN(shape, Mode::SIGMOID, dtype)        \
    RUN(shape, Mode::EXP, dtype)            \
    RUN(shape, Mode::TANH, dtype)           \
    RUN(shape, Mode::FAST_TANH, dtype)

    TensorShape shape = {10, 50, 10, 100};
    BENCHMARK_CASES_INT(shape, dtype::Int32());
    BENCHMARK_CASES_INT(shape, dtype::Int16());
    BENCHMARK_CASES_INT(shape, dtype::Int8());
    BENCHMARK_CASES_FLOAT(shape, dtype::Float32());
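    // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is the ACLE macro the compiler
    // defines when targeting FP16 vector arithmetic (ARMv8.2-A half floats),
    // so the Float16 cases only build on such toolchains.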
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    BENCHMARK_CASES_FLOAT(shape, dtype::Float16());
#endif
#undef BENCHMARK_CASES_INT
#undef BENCHMARK_CASES_FLOAT
#undef RUN
}

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_UNARY_MULTI_TYPE) {
    Benchmarker<ElemwiseMultiType> B(handle());
    using Mode = ElemwiseMultiType::Param::Mode;
    const size_t RUN_TIMES = 20;
    B.set_times(RUN_TIMES).set_display(false);
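    // QRELU/QABS read a quantized tensor and requantize into the destination
    // dtype (here QuantizedS32 -> QuantizedS8). The reported bandwidth is an
    // approximation: it charges the source element size for both the read and
    // the write, even though the destination elements are narrower.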
    auto run_unary = [&](const TensorShape& shape, Mode mode, const char* mode_str,
                         DType src_dtype, DType dst_dtype) {
        B.set_param(mode).set_dtype(0, src_dtype).set_dtype(1, dst_dtype);
        float time = B.execs({shape, {}}) / RUN_TIMES;
        float computations = shape.total_nr_elems() * 2 / (1024.f * 1024.f * 1024.f);
        printf("type %s %s(%s) to %s \ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               src_dtype.name(), shape.to_string().c_str(), dst_dtype.name(), time,
               computations * src_dtype.size() / time * 1e3);
    };
#define RUN(shape, mode, src_dtype, dst_dtype) \
    run_unary(shape, mode, #mode, src_dtype, dst_dtype);
#define BENCHMARK_CASES_INT(shape, src_dtype, dst_dtype) \
    RUN(shape, Mode::QRELU, src_dtype, dst_dtype)        \
    RUN(shape, Mode::QABS, src_dtype, dst_dtype)

    TensorShape shape = {10, 50, 10, 100};
    BENCHMARK_CASES_INT(shape, dtype::QuantizedS32(62.5f), dtype::QuantizedS8(2.5f));
#undef BENCHMARK_CASES_INT
#undef RUN
}

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_BINARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);
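    // Bandwidth model for a binary op: read both inputs in full and write one
    // output whose element count equals the larger input, which matches the
    // broadcast patterns benchmarked below.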
    auto run_binary = [&](const TensorShape& shape0, const TensorShape& shape1,
                          param::Elemwise::Mode mode, const char* mode_str,
                          DType dtype) {
        B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype);
        float time = B.execs({shape0, shape1, {}}) / RUN_TIMES;
        float bandwidth = (shape0.total_nr_elems() + shape1.total_nr_elems() +
                           std::max(shape0.total_nr_elems(), shape1.total_nr_elems())) /
                          (1024.f * 1024.f * 1024.f) * dtype.size() / time * 1e3;
        printf("%s(%s):\tlayout(%s %s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               dtype.name(), shape0.to_string().c_str(), shape1.to_string().c_str(),
               time, bandwidth);
    };
#define RUN(shape0, shape1, mode, dtype) run_binary(shape0, shape1, mode, #mode, dtype);
#define BENCHMARK_CASES_INT(shape0, shape1, dtype) \
    RUN(shape0, shape1, Mode::ADD, dtype)          \
    RUN(shape0, shape1, Mode::MIN, dtype)          \
    RUN(shape0, shape1, Mode::MAX, dtype)          \
    RUN(shape0, shape1, Mode::SUB, dtype)          \
    RUN(shape0, shape1, Mode::MUL, dtype)          \
    RUN(shape0, shape1, Mode::FUSE_ADD_RELU, dtype)
#define BENCHMARK_CASES_FLOAT(shape0, shape1, dtype)   \
    BENCHMARK_CASES_INT(shape0, shape1, dtype)         \
    RUN(shape0, shape1, Mode::TRUE_DIV, dtype)         \
    RUN(shape0, shape1, Mode::FUSE_ADD_SIGMOID, dtype) \
    RUN(shape0, shape1, Mode::FUSE_ADD_TANH, dtype)
#define BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1)      \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int32()); \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int16()); \
    BENCHMARK_CASES_INT(shape0, shape1, dtype::Int8());  \
    BENCHMARK_CASES_FLOAT(shape0, shape1, dtype::Float32());
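    // Three broadcast patterns: equal shapes (pure elementwise), {1, C, 1, 1}
    // (per-channel broadcast in NCHW layout), and {1, 1, 1, 1} (scalar broadcast).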
    TensorShape shape0 = {10, 50, 10, 100};
    TensorShape shape1 = {10, 50, 10, 100};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);

    shape1 = {1, 50, 1, 1};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);

    shape1 = {1, 1, 1, 1};
    BENCHMARK_CASES_EVERY_DTYPE(shape0, shape1);
#undef BENCHMARK_CASES_EVERY_DTYPE
#undef BENCHMARK_CASES_FLOAT
#undef BENCHMARK_CASES_INT
#undef RUN
}

TEST_F(ARM_COMMON, BENCHMARK_ELEMWISE_TERNARY) {
    Benchmarker<ElemwiseForward> B(handle());
    using Mode = ElemwiseForward::Param::Mode;
    const size_t RUN_TIMES = 10;
    B.set_times(RUN_TIMES).set_display(false);
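    // FUSE_MUL_ADD3 computes d = a * b + c. The bandwidth model counts shape0
    // twice (one read of a plus the output write, assuming the output has
    // shape0's size) and one read each of b and c.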
    auto run_ternary = [&](const TensorShape& shape0, const TensorShape& shape1,
                           const TensorShape& shape2, param::Elemwise::Mode mode,
                           const char* mode_str, DType dtype) {
        B.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype).set_dtype(2, dtype);
        float time = B.execs({shape0, shape1, shape2, {}}) / RUN_TIMES;
        float bandwidth = (shape0.total_nr_elems() * 2 + shape1.total_nr_elems() +
                           shape2.total_nr_elems()) /
                          (1024.f * 1024.f * 1024.f) * dtype.size() / time * 1e3;
        printf("%s(%s):\tlayout(%s %s %s)\ttime(%fms)\tbandwidth(%fGBps)\n", mode_str,
               dtype.name(), shape0.to_string().c_str(), shape1.to_string().c_str(),
               shape2.to_string().c_str(), time, bandwidth);
    };
    TensorShape shape = {10, 50, 10, 100};
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Int32());
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Int16());
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Int8());
    run_ternary(
            shape, shape, shape, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3",
            dtype::Float32());
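    // Broadcast variants: scalar b and c, or scalar a and c. Note that the
    // whole block below, including the Float32 cases, is gated on FP16 support.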
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    run_ternary(
            shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float32());
    run_ternary(
            shape, {1}, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float16());
    run_ternary(
            {1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float32());
    run_ternary(
            {1}, shape, {1}, Mode::FUSE_MUL_ADD3, "FUSE_MUL_ADD3", dtype::Float16());
#endif
}
#endif  // MEGDNN_WITH_BENCHMARK