You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

elemwise.cpp 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. /**
  2. * \file dnn/test/arm_common/elemwise.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "test/common/elemwise.h"
  13. #include "test/arm_common/fixture.h"
  14. #include "test/common/benchmarker.h"
  15. #include "test/common/checker.h"
  16. #include "megdnn/oprs/general.h"
  17. using namespace megdnn;
  18. using namespace test;
  19. template <typename tag>
  20. class ARM_ELEMWISE : public ARM_COMMON {};
  21. TYPED_TEST_CASE(ARM_ELEMWISE, elemwise::test_types);
  22. TYPED_TEST(ARM_ELEMWISE, run) {
  23. elemwise::run_test<TypeParam>(this->handle());
  24. }
  25. template <typename tag>
  26. class ARM_ELEMWISE_MULTI_THREADS : public ARM_COMMON_MULTI_THREADS {};
  27. TYPED_TEST_CASE(ARM_ELEMWISE_MULTI_THREADS, elemwise::test_types);
  28. TYPED_TEST(ARM_ELEMWISE_MULTI_THREADS, run) {
  29. elemwise::run_test<TypeParam>(this->handle());
  30. }
  31. TEST_F(ARM_COMMON, ELEMWISE_FORWARD_TERNARY) {
  32. using Mode = ElemwiseForward::Param::Mode;
  33. Checker<ElemwiseForward> checker(handle());
  34. checker.set_param(Mode::FUSE_MUL_ADD3);
  35. auto run = [&] {
  36. //! nchw44
  37. checker.execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  38. checker.execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  39. checker.execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  40. checker.execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  41. checker.execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  42. //! nchw44
  43. checker.execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  44. checker.execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  45. checker.execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  46. checker.execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  47. checker.execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  48. checker.execs({{3, 4, 7}, {3, 4, 7}, {3, 4, 7}, {}});
  49. checker.execs({{1, 4, 1, 1}, {3, 4, 5, 7}, {1, 4, 1, 1}, {}});
  50. checker.execs({{1, 4, 1}, {3, 4, 7}, {1, 4, 1}, {}});
  51. checker.execs({{3, 4, 5, 7}, {3, 4, 5, 7}, {1, 1, 1, 1}, {}});
  52. checker.execs({{1, 7}, {1, 7}, {1, 7}, {}});
  53. checker.execs({{1, 2, 1}, {1, 2, 2}, {1, 2, 1}, {}});
  54. checker.execs({{1, 2, 2}, {1, 2, 2}, {1, 1, 1}, {}});
  55. checker.execs({{3, 4, 1}, {3, 4, 1}, {3, 4, 1}, {}});
  56. checker.execs({{3, 4, 5}, {1}, {1}, {}});
  57. checker.execs({{1}, {3, 4, 5}, {1}, {}});
  58. };
  59. // case int
  60. checker.set_dtype(0, dtype::Int8());
  61. checker.set_dtype(1, dtype::Int8());
  62. checker.set_dtype(2, dtype::Int8());
  63. run();
  64. checker.set_dtype(0, dtype::Int16());
  65. checker.set_dtype(1, dtype::Int16());
  66. checker.set_dtype(2, dtype::Int16());
  67. run();
  68. checker.set_dtype(0, dtype::Int32());
  69. checker.set_dtype(1, dtype::Int32());
  70. checker.set_dtype(2, dtype::Int32());
  71. run();
  72. // case float
  73. UniformFloatRNG rng(1e-5, 7e1);
  74. checker.set_rng(0, &rng);
  75. checker.set_epsilon(1e-5);
  76. checker.set_dtype(0, dtype::Float32());
  77. checker.set_dtype(1, dtype::Float32());
  78. checker.set_dtype(2, dtype::Float32());
  79. run();
  80. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  81. // case half
  82. UniformFloatRNG rng_float16(1, 10);
  83. checker.set_rng(0, &rng_float16);
  84. checker.set_epsilon(1e-2);
  85. checker.set_dtype(0, dtype::Float16());
  86. checker.set_dtype(1, dtype::Float16());
  87. checker.set_dtype(2, dtype::Float16());
  88. run();
  89. #endif
  90. }
  91. TEST_F(ARM_COMMON, ELEMWISE_FORWARD_NCHW44_INT8_INT16_INT32) {
  92. using Mode = ElemwiseForward::Param::Mode;
  93. Checker<ElemwiseForward> checker(handle());
  94. auto run = [&]() {
  95. // VEC_BCAST101x not PowOp
  96. checker.set_param(Mode::ADD).execs(
  97. {{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  98. checker.set_param(Mode::ADD).execs(
  99. {{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  100. checker.set_param(Mode::ADD).execs(
  101. {{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  102. checker.set_param(Mode::ADD).execs(
  103. {{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  104. checker.set_param(Mode::ADD).execs(
  105. {{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  106. checker.set_param(Mode::RMULH)
  107. .execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  108. checker.set_param(Mode::RMULH)
  109. .execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  110. checker.set_param(Mode::RMULH)
  111. .execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  112. checker.set_param(Mode::RMULH)
  113. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  114. checker.set_param(Mode::RMULH)
  115. .execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  116. checker.set_param(Mode::FUSE_ADD_RELU)
  117. .execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  118. checker.set_param(Mode::FUSE_ADD_RELU)
  119. .execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  120. checker.set_param(Mode::FUSE_ADD_RELU)
  121. .execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  122. checker.set_param(Mode::FUSE_ADD_RELU)
  123. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  124. checker.set_param(Mode::FUSE_ADD_RELU)
  125. .execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  126. // BCAST101x_VEC not PowOp
  127. checker.set_param(Mode::ADD).execs(
  128. {{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  129. checker.set_param(Mode::ADD).execs(
  130. {{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  131. checker.set_param(Mode::ADD).execs(
  132. {{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  133. checker.set_param(Mode::ADD).execs(
  134. {{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  135. checker.set_param(Mode::ADD).execs(
  136. {{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  137. checker.set_param(Mode::FUSE_ADD_RELU)
  138. .execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  139. checker.set_param(Mode::FUSE_ADD_RELU)
  140. .execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  141. checker.set_param(Mode::FUSE_ADD_RELU)
  142. .execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  143. checker.set_param(Mode::FUSE_ADD_RELU)
  144. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  145. checker.set_param(Mode::FUSE_ADD_RELU)
  146. .execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  147. };
  148. checker.set_dtype(0, dtype::Int8());
  149. checker.set_dtype(1, dtype::Int8());
  150. run();
  151. checker.set_dtype(0, dtype::Int16());
  152. checker.set_dtype(1, dtype::Int16());
  153. run();
  154. checker.set_dtype(0, dtype::Int32());
  155. checker.set_dtype(1, dtype::Int32());
  156. run();
  157. }
  158. TEST_F(ARM_COMMON, ELEMWISE_FORWARD_NCHW44_FP32) {
  159. using Mode = ElemwiseForward::Param::Mode;
  160. Checker<ElemwiseForward> checker(handle());
  161. UniformFloatRNG rng(1e-5, 7e1);
  162. checker.set_rng(0, &rng);
  163. checker.set_epsilon(1e-5);
  164. checker.set_dtype(0, dtype::Float32());
  165. checker.set_dtype(1, dtype::Float32());
  166. checker.set_param(Mode::FUSE_ADD_RELU)
  167. .execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  168. checker.set_param(Mode::FUSE_ADD_RELU)
  169. .execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  170. checker.set_param(Mode::FUSE_ADD_RELU)
  171. .execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  172. checker.set_param(Mode::FUSE_ADD_RELU)
  173. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  174. checker.set_param(Mode::FUSE_ADD_RELU)
  175. .execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  176. checker.set_param(Mode::FUSE_ADD_RELU)
  177. .execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  178. checker.set_param(Mode::FUSE_ADD_RELU)
  179. .execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  180. checker.set_param(Mode::FUSE_ADD_RELU)
  181. .execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  182. checker.set_param(Mode::FUSE_ADD_RELU)
  183. .execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  184. checker.set_param(Mode::FUSE_ADD_RELU)
  185. .execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  186. auto run = [&](Mode mode) {
  187. // VEC_BCAST101x
  188. checker.set_param(mode).execs({{1, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  189. checker.set_param(mode).execs({{2, 3, 2, 2, 4}, {1, 3, 1, 1, 4}, {}});
  190. checker.set_param(mode).execs({{3, 8, 5, 3, 4}, {1, 8, 1, 1, 4}, {}});
  191. checker.set_param(mode).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  192. checker.set_param(mode).execs({{1, 2, 5, 7, 4}, {1, 2, 1, 1, 4}, {}});
  193. // BCAST101x_VEC not powOp
  194. checker.set_param(mode).execs({{1, 3, 1, 1, 4}, {1, 3, 2, 2, 4}, {}});
  195. checker.set_param(mode).execs({{1, 3, 1, 1, 4}, {2, 3, 2, 2, 4}, {}});
  196. checker.set_param(mode).execs({{1, 8, 1, 1, 4}, {3, 8, 5, 3, 4}, {}});
  197. checker.set_param(mode).execs({{3, 4, 5, 7, 4}, {3, 4, 5, 7, 4}, {}});
  198. checker.set_param(mode).execs({{1, 2, 1, 1, 4}, {1, 2, 5, 7, 4}, {}});
  199. };
  200. run(Mode::ADD);
  201. run(Mode::FUSE_ADD_H_SWISH);
  202. run(Mode::FUSE_ADD_RELU);
  203. run(Mode::MAX);
  204. run(Mode::MIN);
  205. run(Mode::MUL);
  206. run(Mode::SUB);
  207. run(Mode::TRUE_DIV);
  208. run(Mode::POW);
  209. }
  210. #if MEGDNN_WITH_BENCHMARK
  211. namespace {
  212. void run_elemwise_benchmark(const TensorShapeArray& shapes,
  213. param::Elemwise::Mode mode, const char* mode_str,
  214. DType type, Handle* handle_bench) {
  215. auto handle_fallback = create_cpu_handle(1);
  216. Benchmarker<Elemwise> benchmarker_bench(handle_bench);
  217. Benchmarker<Elemwise> benchmarker_fallback(handle_fallback.get());
  218. float throughput = 0;
  219. SmallVector<TensorLayout> layouts;
  220. std::string src_strs;
  221. for (size_t i = 0; i < shapes.size(); i++) {
  222. layouts.emplace_back(shapes[i], type);
  223. throughput += layouts.back().span().dist_byte();
  224. src_strs += layouts.back().to_string();
  225. if (i != shapes.size() - 1) {
  226. src_strs += ",";
  227. }
  228. }
  229. constexpr size_t RUN = 50;
  230. benchmarker_fallback.set_times(RUN).set_display(false);
  231. benchmarker_bench.set_times(RUN).set_display(false);
  232. benchmarker_fallback.set_param(mode);
  233. benchmarker_bench.set_param(mode);
  234. TensorLayout dst_layout;
  235. auto opr = handle_bench->create_operator<Elemwise>();
  236. opr->param() = mode;
  237. opr->deduce_layout(layouts, dst_layout);
  238. float computations = dst_layout.total_nr_elems() *
  239. (std::max<size_t>(shapes.size(), 2) - 1);
  240. throughput += dst_layout.span().dist_byte();
  241. computations *= (1e3 / (1024.0 * 1024));
  242. throughput *= (1e3 / (1024.0 * 1024));
  243. layouts.emplace_back(dst_layout);
  244. auto fallback_time = benchmarker_fallback.execl(layouts) / RUN;
  245. auto bench_time = benchmarker_bench.execl(layouts) / RUN;
  246. float fallback_flops = computations / fallback_time;
  247. float bench_flops = computations / bench_time;
  248. float fallback_thr = throughput / fallback_time;
  249. float bench_thr = throughput / bench_time;
  250. printf("%s = %s (type: %s, mode: %s) cpu=%fMFLOPS %fMB/s, bench=%fMFLOPS "
  251. "%fMB/s "
  252. "computations: %fx, throughput: %fx\n",
  253. src_strs.c_str(), dst_layout.to_string().c_str(), type.name(),
  254. mode_str, fallback_flops, fallback_thr, bench_flops, bench_thr,
  255. bench_flops / fallback_flops, bench_thr / fallback_thr);
  256. }
  257. } // namespace
  //! Benchmarks \p mode on \p shape for Int8, Int16 and Int32. Expands to
  //! three statements and calls handle(), so it must be used inside a test
  //! body of a fixture that provides handle(). #mode is printed as the label.
  #define INT_RUN(shape, mode)                                              \
      run_elemwise_benchmark(shape, mode, #mode, dtype::Int8{}, handle());  \
      run_elemwise_benchmark(shape, mode, #mode, dtype::Int16{}, handle()); \
      run_elemwise_benchmark(shape, mode, #mode, dtype::Int32{}, handle());

  //! Same as INT_RUN, but for Float32 and Float16.
  #define FLOAT_RUN(shape, mode)                                              \
      run_elemwise_benchmark(shape, mode, #mode, dtype::Float32{}, handle()); \
      run_elemwise_benchmark(shape, mode, #mode, dtype::Float16{}, handle());

  //! Runs the integer cases followed by the float cases. The two
  //! *_BENCHMARK_CASES macros are not defined here: each benchmark test
  //! defines them before use and undefines them afterwards.
  #define BENCHMARK_CASES(shape) \
      INT_BENCHMARK_CASES(shape) \
      FLOAT_BENCHMARK_CASES(shape)
  268. TEST_F(ARM_COMMON, BENCHMARK_UNARY) {
  269. #define INT_BENCHMARK_CASES(shape) \
  270. INT_RUN(shape, Mode::RELU); \
  271. INT_RUN(shape, Mode::ABS);
  272. #define FLOAT_BENCHMARK_CASES(shape) \
  273. FLOAT_RUN(shape, Mode::RELU); \
  274. FLOAT_RUN(shape, Mode::ABS); \
  275. FLOAT_RUN(shape, Mode::SIGMOID); \
  276. FLOAT_RUN(shape, Mode::EXP); \
  277. FLOAT_RUN(shape, Mode::TANH); \
  278. FLOAT_RUN(shape, Mode::FAST_TANH);
  279. using Mode = param::Elemwise::Mode;
  280. BENCHMARK_CASES({{10000}});
  281. BENCHMARK_CASES({{50000}});
  282. #undef INT_BENCHMARK_CASES
  283. #undef FLOAT_BENCHMARK_CASES
  284. }
  285. TEST_F(ARM_COMMON, BENCHMARK_BINARY) {
  286. #define INT_BENCHMARK_CASES(shape) \
  287. INT_RUN(shape, Mode::MIN); \
  288. INT_RUN(shape, Mode::MAX); \
  289. INT_RUN(shape, Mode::ADD); \
  290. INT_RUN(shape, Mode::SUB); \
  291. INT_RUN(shape, Mode::MUL); \
  292. INT_RUN(shape, Mode::RMULH); \
  293. INT_RUN(shape, Mode::FUSE_ADD_RELU);
  294. #define FLOAT_BENCHMARK_CASES(shape) \
  295. FLOAT_RUN(shape, Mode::MIN); \
  296. FLOAT_RUN(shape, Mode::MAX); \
  297. FLOAT_RUN(shape, Mode::ADD); \
  298. FLOAT_RUN(shape, Mode::SUB); \
  299. FLOAT_RUN(shape, Mode::MUL); \
  300. FLOAT_RUN(shape, Mode::POW); \
  301. FLOAT_RUN(shape, Mode::TRUE_DIV); \
  302. FLOAT_RUN(shape, Mode::FUSE_ADD_RELU);
  303. using Mode = param::Elemwise::Mode;
  304. TensorShapeArray shapes = {{1, 112, 28, 28}, {1, 112, 28, 28}};
  305. BENCHMARK_CASES(shapes);
  306. shapes = {{1, 16, 1, 1}, {1, 16, 112, 112}};
  307. BENCHMARK_CASES(shapes);
  308. shapes = {{1, 448, 7, 7}, {1, 448, 7, 7}};
  309. BENCHMARK_CASES(shapes);
  310. #undef INT_BENCHMARK_CASES
  311. #undef FLOAT_BENCHMARK_CASES
  312. }
  313. TEST_F(ARM_COMMON, BENCHMARK_TERNARY_FMA3) {
  314. #define INT_BENCHMARK_CASES(shape) INT_RUN(shape, Mode::FUSE_MUL_ADD3);
  315. #define FLOAT_BENCHMARK_CASES(shape) FLOAT_RUN(shape, Mode::FUSE_MUL_ADD3);
  316. using Mode = param::Elemwise::Mode;
  317. TensorShapeArray shapes = {{30, 40, 70}, {30, 40, 70}, {30, 40, 70}};
  318. BENCHMARK_CASES(shapes);
  319. shapes = {{1, 4, 1, 1}, {3, 4, 5, 7}, {1, 4, 1, 1}};
  320. BENCHMARK_CASES(shapes);
  321. shapes = {{3, 4, 5, 7}, {3, 4, 5, 7}, {1, 1, 1, 1}};
  322. BENCHMARK_CASES(shapes);
  323. #undef INT_BENCHMARK_CASES
  324. #undef FLOAT_BENCHMARK_CASES
  325. }
  326. #undef BENCHMARK_CASES
  327. #undef INT_RUN
  328. #undef FLOAT_RUN
  329. #endif
  330. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台