You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they may include dashes ('-') and can be up to 35 characters long.

pooling.cpp 25 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594
  1. #include "test/arm_common/fixture.h"
  2. #include "test/common/benchmarker.h"
  3. #include "test/common/checker.h"
  4. #include "test/common/pooling.h"
  5. #include "test/common/rng.h"
  6. #include "test/common/task_record_check.h"
  7. namespace megdnn {
  8. namespace test {
  9. TEST_F(ARM_COMMON, POOLING) {
  10. using Param = param::Pooling;
  11. // clang-format off
  12. for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  13. for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  14. for (size_t p: {1, 2})
  15. {
  16. Param param;
  17. param.mode = Param::Mode::MAX;
  18. param.window_h = param.window_w = 3;
  19. param.stride_h = param.stride_w = 2;
  20. param.pad_h = param.pad_w = p;
  21. Checker<Pooling> checker(handle());
  22. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  23. param.mode = Param::Mode::AVERAGE;
  24. param.window_h = param.window_w = 3;
  25. param.stride_h = param.stride_w = 2;
  26. param.pad_h = param.pad_w = p;
  27. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  28. param.mode = Param::Mode::MAX;
  29. param.window_h = param.window_w = 4;
  30. param.stride_h = param.stride_w = 2;
  31. param.pad_h = param.pad_w = p;
  32. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  33. param.mode = Param::Mode::MAX;
  34. param.window_h = param.window_w = 5;
  35. param.stride_h = param.stride_w = 2;
  36. param.pad_h = param.pad_w = p;
  37. if (ih + p * 2 >= 5 && iw + p * 2 >= 5)
  38. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  39. }
  40. for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  41. for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  42. for (size_t p: {1, 2})
  43. {
  44. Param param;
  45. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  46. param.window_h = param.window_w = 3;
  47. param.stride_h = param.stride_w = 1;
  48. param.pad_h = param.pad_w = p;
  49. Checker<Pooling> checker(handle());
  50. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  51. }
  52. // clang-format on
  53. }
  54. TEST_F(ARM_COMMON, POOLING_RECORD) {
  55. using Param = param::Pooling;
  56. TaskRecordChecker<Pooling> checker(0);
  57. // clang-format off
  58. for (size_t ih: {2, 3, 5, 7, 11, 13, 17})
  59. for (size_t iw: {2, 3, 5, 7, 11, 13, 17})
  60. for (size_t p: {1, 2})
  61. {
  62. Param param;
  63. param.mode = Param::Mode::MAX;
  64. param.window_h = param.window_w = 3;
  65. param.stride_h = param.stride_w = 2;
  66. param.pad_h = param.pad_w = p;
  67. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  68. param.mode = Param::Mode::AVERAGE;
  69. param.window_h = param.window_w = 3;
  70. param.stride_h = param.stride_w = 2;
  71. param.pad_h = param.pad_w = p;
  72. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  73. param.mode = Param::Mode::MAX;
  74. param.window_h = param.window_w = 4;
  75. param.stride_h = param.stride_w = 2;
  76. param.pad_h = param.pad_w = p;
  77. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  78. param.mode = Param::Mode::MAX;
  79. param.window_h = param.window_w = 5;
  80. param.stride_h = param.stride_w = 2;
  81. param.pad_h = param.pad_w = p;
  82. if (ih + p * 2 >= 5 && iw + p * 2 >= 5)
  83. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  84. }
  85. for (size_t ih: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  86. for (size_t iw: {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  87. for (size_t p: {1, 2})
  88. {
  89. Param param;
  90. param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
  91. param.window_h = param.window_w = 3;
  92. param.stride_h = param.stride_w = 1;
  93. param.pad_h = param.pad_w = p;
  94. Checker<Pooling> checker(handle());
  95. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  96. }
  97. // clang-format on
  98. }
  99. TEST_F(ARM_COMMON, POOLING_INT8_W2x2_S2x2) {
  100. // clang-format off
  101. for (size_t ih: {2, 3, 7, 13, 52, 53, 54, 55})
  102. for (size_t iw: {2, 3, 6, 14, 53, 54, 55, 56})
  103. for (size_t ph: {0, 1})
  104. for (size_t pw: {0, 1})
  105. if (ih+2*ph >= 3 && iw+2*pw >= 3)
  106. {
  107. Checker<Pooling> checker(handle());
  108. checker.set_dtype(0, dtype::Int8());
  109. param::Pooling param;
  110. param.mode = param::Pooling::Mode::MAX;
  111. param.pad_h = ph;
  112. param.pad_w = pw;
  113. param.stride_h = param.stride_w = 2;
  114. param.window_h = param.window_w = 2;
  115. checker.set_param(param).exec(TensorShapeArray{{2, 3, ih, iw}, {}});
  116. }
  117. // clang-format on
  118. }
  119. TEST_F(ARM_COMMON, POOLING_INT8_W3x3_S2x2) {
  120. // clang-format off
  121. for (size_t ih: {2, 3, 7, 13, 52, 53, 54, 55})
  122. for (size_t iw: {2, 3, 6, 14, 53, 54, 55, 56})
  123. for (size_t ph: {0, 1, 2})
  124. for (size_t pw: {0, 1, 2})
  125. if (ih+2*ph >= 3 && iw+2*pw >= 3)
  126. {
  127. Checker<Pooling> checker(handle());
  128. checker.set_dtype(0, dtype::Int8());
  129. param::Pooling param;
  130. param.mode = param::Pooling::Mode::MAX;
  131. param.pad_h = ph;
  132. param.pad_w = pw;
  133. param.stride_h = param.stride_w = 2;
  134. param.window_h = param.window_w = 3;
  135. checker.set_param(param).exec(TensorShapeArray{{2, 3, ih, iw}, {}});
  136. }
  137. // clang-format on
  138. }
  139. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, POOLING_FP16) {
    Checker<Pooling> checker(handle());
    //! fp16 in/out; epsilon relaxed to 3e-3 for half-precision rounding.
    checker.set_dtype(0, dtype::Float16{})
            .set_dtype(1, dtype::Float16{})
            .set_epsilon(3e-3);
    using Param = param::Pooling;
    for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23})
    for (size_t iw : {2, 3, 5, 7, 11, 13, 17, 19, 23})
    for (auto mode : {Param::Mode::AVERAGE, Param::Mode::MAX}) {
        for (size_t window : {2, 3}) {
            Param param;
            param.mode = mode;
            param.window_h = param.window_w = window;
            param.stride_h = param.stride_w = 1;
            param.pad_h = param.pad_w = window / 2;
            //! test for SH == 1 && SW == 1 && FH == FW (FH == 2 || FH
            //! == 3)
            checker.set_param(param).exec({{2, 3, ih, iw}, {}});
            //! test for SH = SW = 2 && FH = FW = 2
            param.stride_h = param.stride_w = 2;
            checker.set_param(param).exec({{2, 3, ih, iw}, {}});
        }
    }
    //! test for SH == 2 && SW == 2 && FH == FW == 3 max pooling
    for (size_t ih : {2, 3, 7, 13, 52, 53, 54, 55})
    for (size_t iw : {2, 3, 6, 14, 53, 54, 55, 56})
    for (size_t ph : {0, 1, 2})
    for (size_t pw : {0, 1, 2})
    //! only run shapes whose padded input fits the 3x3 window
    if (ih + 2 * ph >= 3 && iw + 2 * pw >= 3) {
        param::Pooling param;
        param.mode = param::Pooling::Mode::MAX;
        param.pad_h = ph;
        param.pad_w = pw;
        param.stride_h = param.stride_w = 2;
        param.window_h = param.window_w = 3;
        checker.set_param(param).exec(
                TensorShapeArray{{2, 3, ih, iw}, {}});
    }
    //! test for SH == 2 && SW == 2 && FH = FW = 4 max pooling
    for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t iw : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t p : {1, 2}) {
        Param param;
        param.mode = Param::Mode::MAX;
        param.window_h = param.window_w = 4;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});
    }
    //! test for SH == 2 && SW == 2 && FH = FW = 5 max pooling
    //! (ih/iw start at 3 here so the 5x5 window fits with padding >= 1)
    for (size_t ih : {3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t iw : {3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
    for (size_t p : {1, 2}) {
        Param param;
        param.mode = Param::Mode::MAX;
        param.window_h = param.window_w = 5;
        param.stride_h = param.stride_w = 2;
        param.pad_h = param.pad_w = p;
        checker.set_param(param).exec({{2, 3, ih, iw}, {}});
    }
}
  201. #endif
  202. TEST_F(ARM_COMMON, POOLING_QUANTIZED) {
  203. Checker<Pooling> checker(handle());
  204. UniformIntRNG rng1{INT8_MIN >> 1, INT8_MAX >> 1};
  205. UniformIntRNG rng2{0, UINT8_MAX >> 1};
  206. using Param = param::Pooling;
  207. for (auto type : std::vector<DType>{
  208. dtype::QuantizedS8(1.1f),
  209. dtype::Quantized8Asymm(1.1f, static_cast<uint8_t>(3))}) {
  210. if (type.enumv() == DTypeEnum::QuantizedS8) {
  211. checker.set_rng(0, &rng1);
  212. } else {
  213. megdnn_assert(type.enumv() == DTypeEnum::Quantized8Asymm);
  214. checker.set_rng(0, &rng2);
  215. }
  216. for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23, 33, 49})
  217. for (size_t iw : {2, 3, 5, 7, 11, 13, 17, 19, 23, 33, 49})
  218. for (auto mode : {Param::Mode::AVERAGE, Param::Mode::MAX}) {
  219. for (size_t window : {2, 3}) {
  220. Param param;
  221. param.mode = mode;
  222. param.window_h = param.window_w = window;
  223. param.stride_h = param.stride_w = 1;
  224. param.pad_h = param.pad_w = window / 2;
  225. //! test for SH == 1 && SW == 1 && FH == FW (FH == 2 ||
  226. //! FH
  227. //! == 3)
  228. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  229. //! test for SH = SW = 2 && FH = FW = 2
  230. param.stride_h = param.stride_w = 2;
  231. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  232. }
  233. }
  234. //! test for SH == 2 && SW == 2 && FH == FW == 3 max pooling
  235. for (size_t ih : {2, 3, 7, 13, 52, 53, 54, 55})
  236. for (size_t iw : {2, 3, 6, 14, 53, 54, 55, 56})
  237. for (size_t ph : {0, 1, 2})
  238. for (size_t pw : {0, 1, 2})
  239. if (ih + 2 * ph >= 3 && iw + 2 * pw >= 3) {
  240. param::Pooling param;
  241. param.mode = param::Pooling::Mode::MAX;
  242. param.pad_h = ph;
  243. param.pad_w = pw;
  244. param.window_h = param.window_w = 3;
  245. param.stride_h = param.stride_w = 2;
  246. checker.set_param(param).exec(
  247. TensorShapeArray{{2, 3, ih, iw}, {}});
  248. }
  249. //! test for SH == 2 && SW == 2 && FH == FW == 4 max pooling
  250. for (size_t ih : {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  251. for (size_t iw :
  252. {2, 3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  253. for (size_t p : {1, 2}) {
  254. Param param;
  255. param.mode = Param::Mode::MAX;
  256. param.window_h = param.window_w = 4;
  257. param.stride_h = param.stride_w = 2;
  258. param.pad_h = param.pad_w = p;
  259. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  260. }
  261. //! test for SH == 2 && SW == 2 && FH == FW == 5 max pooling
  262. for (size_t ih : {3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  263. for (size_t iw : {3, 5, 7, 11, 13, 17, 19, 23, 24, 25, 26, 27, 28, 29, 30})
  264. for (size_t p : {1, 2}) {
  265. Param param;
  266. param.mode = Param::Mode::MAX;
  267. param.window_h = param.window_w = 5;
  268. param.stride_h = param.stride_w = 2;
  269. param.pad_h = param.pad_w = p;
  270. checker.set_param(param).exec({{2, 3, ih, iw}, {}});
  271. }
  272. }
  273. }
  274. #if MEGDNN_WITH_BENCHMARK
//! Benchmark pooling on NCHW fp32 vs NCHW44 int8 vs NCHW44 fp32 layouts
//! for a set of ResNet50/VGG16-shaped workloads on the given handle.
void benchmark_nchw44_fp32(Handle* handle) {
    using Param = param::Pooling;
    auto run = [&](size_t n, size_t c, size_t h, size_t w, size_t filter, size_t stride,
                   size_t pad, Param::Mode mode) {
        Param param;
        param.window_h = param.window_w = filter;
        param.stride_h = param.stride_w = stride;
        param.pad_h = param.pad_w = pad;
        param.format = Param::Format::NCHW;
        param.mode = mode;
        TensorShape nchw_shape = {n, c, h, w};
        //! same tensor re-packed 4 channels per innermost dim
        TensorShape nchw44_shape = {n, c / 4, h, w, 4};
        TensorLayout dst_layout;
        auto opr = handle->create_operator<Pooling>();
        opr->param() = param;
        opr->deduce_layout({nchw_shape, dtype::Float32()}, dst_layout);
        //! one multiply-accumulate per window element per output element
        float calc_amount =
                dst_layout.total_nr_elems() * param.window_h * param.window_w;
        Benchmarker<Pooling> benchmarker_float_nchw(handle);
        Benchmarker<Pooling> benchmarker_float_nchw44(handle);
        Benchmarker<Pooling> benchmarker_int_nchw44(handle);
        size_t RUN = 500;
        auto t1 = benchmarker_float_nchw.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec({nchw_shape, {}});
        //! switch format once; the two NCHW44 runs below reuse this param
        param.format = Param::Format::NCHW44;
        auto t2 = benchmarker_int_nchw44.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .execl({{nchw44_shape, dtype::QuantizedS8(1.0)},
                                  {{}, dtype::QuantizedS8(1.0)}});
        auto t3 = benchmarker_float_nchw44.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec({nchw44_shape, {}});
        printf("{%zu %zu %zu %zu} filter = %zu, stride = %zu pad = %zu\n"
               "nchw_fp32={%.3f ms, %.3f Mflops}, "
               "nchw44_int={%.3f ms, %.3f Mflops}, "
               "nchw44_fp32={%.3f ms, %.3f Mflops, speed_up %f}\n\n",
               n, c, h, w, filter, stride, pad, t1 / RUN,
               calc_amount / (t1 / RUN * 1000), t2 / RUN,
               calc_amount / (t2 / RUN * 1000), t3 / RUN,
               calc_amount / (t3 / RUN * 1000), t1 / t3);
    };
    // Resnet50
    run(1, 64, 112, 112, 3, 2, 1, param::Pooling::Mode::MAX);
    run(1, 2048, 7, 7, 7, 1, 0, param::Pooling::Mode::AVERAGE);
    // VGG16
    run(1, 64, 224, 224, 2, 2, 0, param::Pooling::Mode::MAX);
    run(1, 128, 112, 112, 2, 2, 0, param::Pooling::Mode::MAX);
    run(1, 256, 56, 56, 2, 2, 0, param::Pooling::Mode::MAX);
    run(1, 512, 28, 28, 2, 2, 0, param::Pooling::Mode::MAX);
    run(1, 512, 14, 14, 2, 2, 0, param::Pooling::Mode::MAX);
}
//! single-threaded NCHW44 pooling benchmark
TEST_F(ARM_COMMON, BENCHMARK_POOLING_NCHW44_FP32) {
    benchmark_nchw44_fp32(handle());
}
//! same benchmark on the multi-threaded fixture's handle
TEST_F(ARM_COMMON_MULTI_THREADS, BENCHMARK_POOLING_NCHW44_FP32) {
    benchmark_nchw44_fp32(handle());
}
//! Benchmark 3x3/stride-2 max pooling: naive reference vs float vs int8 on
//! the ARM handle; asserts the int8 path is at least 2x faster than float.
TEST_F(ARM_COMMON, BENCHMARK_POOLING_INT8_W3x3_S2x2) {
    using Param = param::Pooling;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        //! presumably a naive CPU reference handle — see create_cpu_handle
        auto handle_naive = create_cpu_handle(2);
        TensorLayoutArray layouts;
        layouts.emplace_back(shapes[0], dtype::Int8());
        layouts.emplace_back(shapes[1], dtype::Int8());
        Benchmarker<Pooling> benchmarker_naive(handle_naive.get());
        Benchmarker<Pooling> benchmarker_float(handle());
        Benchmarker<Pooling> benchmarker_int(handle());
        size_t RUN = 10;
        auto t1 = benchmarker_naive.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        auto t2 = benchmarker_float.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        auto t3 = benchmarker_int.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .execl(layouts);
        printf("naive=%.3fms float=%.3fms, int=%.3fms\n", t1 / RUN, t2 / RUN, t3 / RUN);
        auto speedup = t2 / t3;
        //! NOTE(review): hard perf assertion — may be flaky on slow or
        //! heavily loaded machines; confirm this threshold is intended.
        ASSERT_GE(speedup, 2.0);
    };
    Param param;
    param.window_h = param.window_w = 3;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    std::cout << "3x3 with 2x2 stride max pooling:" << std::endl;
    run({{1, 3, 640, 480}, {}}, param);
}
//! Benchmark 4x4/stride-2 max pooling: naive reference vs NEON float path.
TEST_F(ARM_COMMON, BENCHMARK_POOLING_W4x4_S2x2) {
    using Param = param::Pooling;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        std::cout << "N:" << shapes[0][0] << " "
                  << "IC:" << shapes[0][1] << " "
                  << "IH:" << shapes[0][2] << " "
                  << "IW:" << shapes[0][3] << std::endl;
        auto handle_naive = create_cpu_handle(2);
        Benchmarker<Pooling> benchmarker_naive(handle_naive.get());
        Benchmarker<Pooling> benchmarker_float(handle());
        size_t RUN = 10;
        auto t1 = benchmarker_naive.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        auto t2 = benchmarker_float.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        TensorLayout dst_layout;
        //! deduce the output shape to convert time into Mflops
        auto opr = handle()->create_operator<Pooling>();
        opr->param() = param;
        opr->deduce_layout({shapes[0], dtype::Float32()}, dst_layout);
        float calc_amount =
                dst_layout.total_nr_elems() * param.window_h * param.window_w;
        printf("naive={%.3fms, %.3fMflops}, neon={%.3fms, %.3fMflops}\n", t1 / RUN,
               calc_amount / (t1 / RUN * 1000), t2 / RUN,
               calc_amount / (t2 / RUN * 1000));
    };
    Param param;
    param.window_h = param.window_w = 4;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    std::cout << "4x4 with 2x2 stride max pooling:" << std::endl;
    run({{1, 24, 160, 128}, {}}, param);
    run({{1, 4, 240, 135}, {}}, param);
    run({{1, 32, 120, 67}, {}}, param);
    run({{1, 64, 60, 33}, {}}, param);
}
//! Benchmark 5x5/stride-2 max pooling: naive reference vs NEON float path.
//! (Same harness as BENCHMARK_POOLING_W4x4_S2x2, differing only in window.)
TEST_F(ARM_COMMON, BENCHMARK_POOLING_W5x5_S2x2) {
    using Param = param::Pooling;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        std::cout << "N:" << shapes[0][0] << " "
                  << "IC:" << shapes[0][1] << " "
                  << "IH:" << shapes[0][2] << " "
                  << "IW:" << shapes[0][3] << std::endl;
        auto handle_naive = create_cpu_handle(2);
        Benchmarker<Pooling> benchmarker_naive(handle_naive.get());
        Benchmarker<Pooling> benchmarker_float(handle());
        size_t RUN = 10;
        auto t1 = benchmarker_naive.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        auto t2 = benchmarker_float.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes);
        TensorLayout dst_layout;
        //! deduce the output shape to convert time into Mflops
        auto opr = handle()->create_operator<Pooling>();
        opr->param() = param;
        opr->deduce_layout({shapes[0], dtype::Float32()}, dst_layout);
        float calc_amount =
                dst_layout.total_nr_elems() * param.window_h * param.window_w;
        printf("naive={%.3fms, %.3fMflops}, neon={%.3fms, %.3fMflops}\n", t1 / RUN,
               calc_amount / (t1 / RUN * 1000), t2 / RUN,
               calc_amount / (t2 / RUN * 1000));
    };
    Param param;
    param.window_h = param.window_w = 5;
    param.stride_h = param.stride_w = 2;
    param.pad_h = param.pad_w = 1;
    std::cout << "5x5 with 2x2 stride max pooling:" << std::endl;
    run({{1, 24, 160, 128}, {}}, param);
    run({{1, 4, 240, 135}, {}}, param);
    run({{1, 32, 120, 67}, {}}, param);
    run({{1, 64, 60, 33}, {}}, param);
}
//! Benchmark fp32 vs fp16 pooling on the ARM handle across several window
//! sizes, strides and modes; prints time, gflops, and the fp16 speedup.
TEST_F(ARM_COMMON, BENCHMARK_POOLING_FP16) {
    using Param = param::Pooling;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        //! fp16 run uses explicit Float16 layouts; fp32 run uses raw shapes
        TensorLayoutArray layouts;
        layouts.emplace_back(shapes[0], dtype::Float16());
        layouts.emplace_back(shapes[1], dtype::Float16());
        Benchmarker<Pooling> benchmarker_float(handle());
        Benchmarker<Pooling> benchmarker_half(handle());
        size_t RUN = 10;
        auto tf = benchmarker_float.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .exec(shapes) /
                  RUN;
        auto th = benchmarker_half.set_display(false)
                          .set_times(RUN)
                          .set_param(param)
                          .execl(layouts) /
                  RUN;
        TensorLayout dst_layout;
        //! deduce output shape to express the work in gflops
        auto opr = handle()->create_operator<Pooling>();
        opr->param() = param;
        opr->deduce_layout({shapes[0], dtype::Float32()}, dst_layout);
        float computations = dst_layout.total_nr_elems() * param.window_h *
                             param.window_w / (1024.f * 1024 * 1024);
        printf("float=%.3fms %f gflops, float16=%.3fms %f gflops speedup: %f\n", tf,
               computations / tf * 1e3, th, computations / th * 1e3, tf / th);
    };
    Param param;
    param.window_h = param.window_w = 2;
    param.stride_h = param.stride_w = 1;
    param.pad_h = param.pad_w = 1;
    printf("2x2 with 1x1 stride max pooling:\n");
    run({{1, 3, 640, 480}, {}}, param);
    for (size_t oh : {640, 128})
        for (size_t ow : {480, 112}) {
            //! 3x3 stride-2 average pooling at this resolution
            param.window_h = param.window_w = 3;
            param.stride_h = param.stride_w = 2;
            param.pad_h = param.pad_w = 1;
            param.mode = Param::Mode::AVERAGE;
            printf("3x3 with 2x2 stride average pooling.\n");
            run({{1, 3, oh, ow}, {}}, param);
            //! then 2x2..5x5 stride-2 max pooling
            for (size_t pw : {2, 3, 4, 5}) {
                param.window_h = param.window_w = pw;
                param.stride_h = param.stride_w = 2;
                param.pad_h = param.pad_w = 1;
                param.mode = Param::Mode::MAX;
                printf("%zux%zu with 2x2 stride max pooling:\n", pw, pw);
                run({{1, 3, oh, ow}, {}}, param);
            }
        }
}
  500. TEST_F(ARM_COMMON, BENCHMARK_POOLING_QUANTIZED) {
  501. using Param = param::Pooling;
  502. auto run = [&](const TensorShapeArray& shapes, Param param) {
  503. auto handle_naive = create_cpu_handle(2);
  504. TensorLayoutArray layouts;
  505. layouts.emplace_back(shapes[0], dtype::QuantizedS8(1.1f));
  506. layouts.emplace_back(shapes[1], dtype::QuantizedS8(1.1f));
  507. Benchmarker<Pooling> benchmarker_int(handle());
  508. Benchmarker<Pooling> benchmarker_naive(handle_naive.get());
  509. size_t RUN = 10;
  510. auto time_int =
  511. benchmarker_int.set_display(false).set_times(RUN).set_param(param).exec(
  512. shapes) /
  513. RUN;
  514. auto time_naive = benchmarker_naive.set_display(false)
  515. .set_times(RUN)
  516. .set_param(param)
  517. .execl(layouts) /
  518. RUN;
  519. TensorLayout dst_layout;
  520. auto opr = handle()->create_operator<Pooling>();
  521. opr->param() = param;
  522. opr->deduce_layout({shapes[0], dtype::QuantizedS8(1.1f)}, dst_layout);
  523. float computations = dst_layout.total_nr_elems() * param.window_h *
  524. param.window_w / (1024.f * 1024 * 1024);
  525. printf("naive=%.3fms %f gflops, int8=%.3fms %f gflops speedup: %f\n",
  526. time_naive, computations / time_naive * 1e3, time_int,
  527. computations / time_int * 1e3, time_naive / time_int);
  528. };
  529. Param param;
  530. param.window_h = param.window_w = 2;
  531. param.stride_h = param.stride_w = 1;
  532. param.pad_h = param.pad_w = 1;
  533. printf("2x2 with 1x1 stride max pooling:\n");
  534. run({{1, 3, 640, 480}, {}}, param);
  535. // clang-format off
  536. for (size_t oh : {640, 128})
  537. for (size_t ow : {480, 112})
  538. for (size_t pw : {2, 3, 4, 5}) {
  539. param.window_h = param.window_w = pw;
  540. param.stride_h = param.stride_w = 2;
  541. param.pad_h = param.pad_w = 1;
  542. printf("%zux%zu with 2x2 stride max pooling:\n", pw, pw);
  543. run({{1, 3, oh, ow}, {}}, param);
  544. }
  545. // clang-format on
  546. }
  547. #endif
  548. } // namespace test
  549. } // namespace megdnn
  550. // vim: syntax=cpp.doxygen