

/**
 * \file dnn/test/aarch64/convolution.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "test/aarch64/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/convolution.h"
#include "test/common/rng.h"

using namespace megdnn;
using namespace test;

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
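// Cross-checks the AArch64 "DeconvMatmul" backward-data algorithm on fp16
// tensors; megdnn's Checker compares the result against a reference
// implementation, accepting element-wise error up to the epsilon set below.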
TEST_F(AARCH64, CONVOLUTION_BACKWARD_DATA_FP16) {
    Checker<ConvolutionBackwardData> checker(handle());
    using Param = ConvolutionBackwardData::Param;
    Param param;
    auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
                   size_t fw, size_t stride, size_t padding, size_t group = 1) {
        param.pad_h = param.pad_w = padding;
        param.stride_h = param.stride_w = stride;
        TensorLayout diff = TensorLayout{{n, oc * group, oh, ow}, dtype::Float16()};
        TensorLayout grad;
        TensorLayout filter;
        if (group == 1) {
            param.sparse = Param::Sparse::DENSE;
            filter = {{oc, ic, fh, fw}, dtype::Float16()};
        } else {
            param.sparse = Param::Sparse::GROUP;
            filter = {{group, oc, ic, fh, fw}, dtype::Float16()};
        }
        // deduce the layout of the input gradient from filter and diff
        {
            auto opr = handle()->create_operator<ConvolutionBackwardData>();
            opr->param() = param;
            opr->deduce_layout(filter, diff, grad);
        }
        NormalRNG rng(10.f);
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_epsilon(1e-2)
                .set_before_exec_callback(
                        AlgoChecker<ConvolutionBackwardData>("DeconvMatmul"));
        checker.exec(TensorLayoutArray{filter, diff, grad});
    };
    for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
        param.mode = mode;
        run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1);
        run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4);
        run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1);
        run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4);
        run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2);
        run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2);
        run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3);
    }
}

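// The tests below are throughput benchmarks rather than correctness checks:
// they print performance figures and are only built when MEGDNN_WITH_BENCHMARK
// is enabled.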
#if MEGDNN_WITH_BENCHMARK
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_QUICK_FP16) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker_gemm(handle());
    benchmarker_gemm.set_times(exec_times);
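    // `mod` folds the unit conversions into one factor, assuming exec()
    // returns the total wall time in milliseconds over `exec_times` runs:
    //   Gflops = 2*M*K*N / (time_ms / exec_times / 1000) / 1e9
    //          = 2*M*K*N / time_ms * (1000 * exec_times / 1e9)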
    float mod = 1000 * exec_times / 1e9;
    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")" << std::endl;
        benchmarker_gemm.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
        time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp32, Performance is " << perf << " Gflops" << std::endl;
        benchmarker_gemm.set_dtype(0, dtype::Float16()).set_dtype(1, dtype::Float16());
        time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp16, Performance is " << perf << " Gflops" << std::endl;
    };
    // run M = K = N
    run(32, 32, 32);
    run(64, 64, 64);
    run(128, 128, 128);
    run(256, 256, 256);
    run(512, 512, 512);
    run(1024, 1024, 1024);
    run(2048, 2048, 2048);
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_ALL_SIZES_FP16) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker_gemm(handle());
    benchmarker_gemm.set_times(exec_times);
    float mod = 1000 * exec_times / 1e9;
    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")" << std::endl;
        benchmarker_gemm.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
        time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp32, Performance is " << perf << " Gflops" << std::endl;
        benchmarker_gemm.set_dtype(0, dtype::Float16()).set_dtype(1, dtype::Float16());
        time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp16, Performance is " << perf << " Gflops" << std::endl;
    };
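    // Warm-up: untimed iterations first, so the measurements below are less
    // likely to be skewed by cold caches or CPU frequency scaling.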
    std::cout << "warm up:\n";
    for (int i = 0; i < 50; i++) {
        benchmarker_gemm.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{256, 256}, {256, 256}, {}});
        benchmarker_gemm.set_display(true);
    }
    // run M = K = N
    run(8, 8, 8);
    run(16, 16, 16);
    run(32, 32, 32);
    run(64, 64, 64);
    run(128, 128, 128);
    run(256, 256, 256);
    run(512, 512, 512);
    run(1024, 1024, 1024);
    run(2048, 2048, 2048);
    // run sgemv like
    run(32, 32, 1);
    run(64, 64, 1);
    run(128, 128, 1);
    run(256, 256, 1);
    run(512, 512, 1);
    // run M, N >> K
    run(32, 16, 32);
    run(64, 16, 64);
    run(128, 16, 128);
    run(256, 16, 256);
    run(512, 16, 512);
    // run N, K >> M
    run(16, 32, 32);
    run(16, 64, 64);
    run(16, 128, 128);
    run(16, 256, 256);
    run(16, 512, 512);
    // run M >> K, N
    run(32, 16, 16);
    run(64, 16, 16);
    run(128, 16, 16);
    run(256, 16, 16);
    run(512, 16, 16);
    // run K >> M, N
    run(16, 32, 16);
    run(16, 64, 16);
    run(16, 128, 16);
    run(16, 256, 16);
    run(16, 512, 16);
    // run N >> M, K
    run(16, 16, 32);
    run(16, 16, 64);
    run(16, 16, 128);
    run(16, 16, 256);
    run(16, 16, 512);
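    // The VGG shapes below are convolution layers mapped to GEMM the im2col
    // way: M = OC, K = IC * FH * FW, N = OH * OW.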
    // run VGG
    // conv 1.1
    run(64, 3 * 3 * 3, 224 * 224);
    // conv 1.2
    run(128, 64 * 3 * 3, 112 * 112);
    // conv 2.1
    run(128, 128 * 3 * 3, 112 * 112);
    // conv 2.2
    run(128, 128 * 3 * 3, 56 * 56);
    // conv 3.1
    run(256, 128 * 3 * 3, 56 * 56);
    // conv 3.2
    run(256, 256 * 3 * 3, 28 * 28);
    // conv 4.1
    run(512, 256 * 3 * 3, 28 * 28);
    // conv 4.2
    run(512, 512 * 3 * 3, 14 * 14);
}
#endif
#endif

#if MEGDNN_WITH_BENCHMARK
TEST_F(AARCH64, BENCHMARK_CONVOLUTION_STRIDE2) {
    using Param = param::Convolution;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        Benchmarker<Convolution> benchmarker_float(handle());
        size_t RUN = 50;
        auto tfloat = benchmarker_float.set_display(false)
                              .set_dtype(0, dtype::Float32{})
                              .set_dtype(1, dtype::Float32{})
                              .set_before_exec_callback(AlgoChecker<Convolution>(
                                      "CONVOLUTION_DEFAULT_ARMV8F32STRD2_LARGE_"
                                      "GROUP"))
                              .set_times(RUN)
                              .set_param(param)
                              .exec(shapes);
        size_t IC = shapes[1][1];
        size_t FH = shapes[1][2];
        size_t FW = shapes[1][3];
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {shapes[0], dtype::Float32()}, {shapes[1], dtype::Float32()},
                dst_layout);
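        // Assuming tfloat is the total time in milliseconds over RUN
        // iterations: each output element costs 2 * IC * FH * FW ops, so
        // dividing by the per-run time in microseconds (tfloat / RUN * 1000)
        // yields ops/us, i.e. Mflops.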
        printf("fp32 flops: %.3f mflops\n",
               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
                       (tfloat / RUN * 1000));
    };
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    auto run1 = [&](const TensorShapeArray& shapes, Param param) {
        Benchmarker<Convolution> benchmarker_float(handle());
        size_t RUN = 50;
        auto tfloat = benchmarker_float.set_display(false)
                              .set_dtype(0, dtype::Float16())
                              .set_dtype(1, dtype::Float16())
                              .set_before_exec_callback(AlgoChecker<Convolution>(
                                      "CONVOLUTION_DEFAULT_ARMV8F16STRD2_LARGE_"
                                      "GROUP"))
                              .set_times(RUN)
                              .set_param(param)
                              .exec(shapes);
        size_t IC = shapes[1][1];
        size_t FH = shapes[1][2];
        size_t FW = shapes[1][3];
        TensorLayout dst_layout;
        auto opr = handle()->create_operator<Convolution>();
        opr->param() = param;
        opr->deduce_layout(
                {shapes[0], dtype::Float16()}, {shapes[1], dtype::Float16()},
                dst_layout);
        printf("fp16 flops: %.3f mflops\n",
               (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
                       (tfloat / RUN * 1000));
    };
#endif
    auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                       size_t stride) {
        Param param;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = kernel / 2;
        param.pad_w = kernel / 2;
        printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n", oc, ic,
               w, h, stride, kernel);
        run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        run1({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
#endif
    };
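    // Sweep 4 kernel sizes x 4 input-channel counts x 4 output-channel counts
    // x 11 spatial sizes (704 shapes in total), all with stride 2.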
    for (size_t kernel : {2, 3, 5, 7}) {
        for (size_t ic : {3, 6, 12, 24}) {
            for (size_t oc : {3, 6, 12, 24}) {
                for (size_t size : {4, 7, 8, 14, 16, 17, 28, 32, 34, 64, 112}) {
                    profile(oc, ic, size, size, kernel, 2);
                }
            }
        }
    }
}
#endif

// vim: syntax=cpp.doxygen