You cannot select more than 25 topics. Each topic must start with a Chinese character, a letter, or a number; it may include dashes ('-') and can be up to 35 characters long.

convolution.cpp 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. /**
  2. * \file dnn/test/aarch64/convolution.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/aarch64/fixture.h"
  12. #include "test/common/benchmarker.h"
  13. #include "test/common/checker.h"
  14. #include "test/common/convolution.h"
  15. #include "test/common/rng.h"
  16. using namespace megdnn;
  17. using namespace test;
  18. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  19. TEST_F(AARCH64, CONVOLUTION_BACKWARD_DATA_FP16) {
  20. Checker<ConvolutionBackwardData> checker(handle());
  21. using Param = ConvolutionBackwardData::Param;
  22. Param param;
  23. auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc,
  24. size_t fh, size_t fw, size_t stride, size_t padding,
  25. size_t group = 1) {
  26. param.pad_h = param.pad_w = padding;
  27. param.stride_h = param.stride_w = stride;
  28. TensorLayout diff =
  29. TensorLayout{{n, oc * group, oh, ow}, dtype::Float16()};
  30. TensorLayout grad;
  31. TensorLayout filter;
  32. if (group == 1) {
  33. param.sparse = Param::Sparse::DENSE;
  34. filter = {{oc, ic, fh, fw}, dtype::Float16()};
  35. } else {
  36. param.sparse = Param::Sparse::GROUP;
  37. filter = {{group, oc, ic, fh, fw}, dtype::Float16()};
  38. }
  39. // TensorLayout grad;
  40. {
  41. auto opr = handle()->create_operator<ConvolutionBackwardData>();
  42. opr->param() = param;
  43. opr->deduce_layout(filter, diff, grad);
  44. }
  45. NormalRNG rng(10.f);
  46. checker.set_param(param)
  47. .set_dtype(0, dtype::Float16())
  48. .set_dtype(1, dtype::Float16())
  49. .set_dtype(2, dtype::Float16())
  50. .set_rng(0, &rng).set_rng(1, &rng)
  51. .set_epsilon(1e-2)
  52. .set_before_exec_callback(
  53. AlgoChecker<ConvolutionBackwardData>("DeconvMatmul"));
  54. checker.exec(TensorLayoutArray{filter, diff, grad});
  55. };
  56. for (auto mode :
  57. {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
  58. param.mode = mode;
  59. run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1);
  60. run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4);
  61. run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1);
  62. run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4);
  63. run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2);
  64. run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2);
  65. run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3);
  66. }
  67. }
  68. #if MEGDNN_WITH_BENCHMARK
  69. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_QUICK_FP16) {
  70. int exec_times = 10;
  71. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  72. benchmarker_gemm.set_times(exec_times);
  73. float mod = 1000 * exec_times / 1e9;
  74. auto run = [&](size_t M, size_t K, size_t N) {
  75. float time = 1.f, perf = 1.f;
  76. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")"
  77. << std::endl;
  78. benchmarker_gemm.set_dtype(0, dtype::Float32())
  79. .set_dtype(1, dtype::Float32());
  80. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  81. perf = 2.f * M * K * N / time * mod;
  82. std::cout << "gemm fp32, Performance is " << perf << " Gflops"
  83. << std::endl;
  84. benchmarker_gemm.set_dtype(0, dtype::Float16())
  85. .set_dtype(1, dtype::Float16());
  86. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  87. perf = 2.f * M * K * N / time * mod;
  88. std::cout << "gemm fp16, Performance is " << perf << " Gflops"
  89. << std::endl;
  90. };
  91. // run M = K = N
  92. run(32, 32, 32);
  93. run(64, 64, 64);
  94. run(128, 128, 128);
  95. run(256, 256, 256);
  96. run(512, 512, 512);
  97. run(1024, 1024, 1024);
  98. run(2048, 2048, 2048);
  99. }
  100. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_ALL_SIZES_FP16) {
  101. int exec_times = 10;
  102. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  103. benchmarker_gemm.set_times(exec_times);
  104. float mod = 1000 * exec_times / 1e9;
  105. auto run = [&](size_t M, size_t K, size_t N) {
  106. float time = 1.f, perf = 1.f;
  107. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")"
  108. << std::endl;
  109. benchmarker_gemm.set_dtype(0, dtype::Float32())
  110. .set_dtype(1, dtype::Float32());
  111. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  112. perf = 2.f * M * K * N / time * mod;
  113. std::cout << "gemm fp32, Performance is " << perf << " Gflops"
  114. << std::endl;
  115. benchmarker_gemm.set_dtype(0, dtype::Float16())
  116. .set_dtype(1, dtype::Float16());
  117. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  118. perf = 2.f * M * K * N / time * mod;
  119. std::cout << "gemm fp16, Performance is " << perf << " Gflops"
  120. << std::endl;
  121. };
  122. std::cout << "warm up:\n";
  123. for (int i = 0; i < 50; i++) {
  124. benchmarker_gemm.set_dtype(0, dtype::Float32())
  125. .set_dtype(1, dtype::Float32())
  126. .set_display(false)
  127. .exec({{256, 256}, {256, 256}, {}});
  128. benchmarker_gemm.set_display(true);
  129. }
  130. // run M = K = N
  131. run(8, 8, 8);
  132. run(16, 16, 16);
  133. run(32, 32, 32);
  134. run(64, 64, 64);
  135. run(128, 128, 128);
  136. run(256, 256, 256);
  137. run(512, 512, 512);
  138. run(1024, 1024, 1024);
  139. run(2048, 2048, 2048);
  140. // run sgmev like
  141. run(32, 32, 1);
  142. run(64, 64, 1);
  143. run(128, 128, 1);
  144. run(256, 256, 1);
  145. run(512, 512, 1);
  146. // run M, N >> K
  147. run(32, 16, 32);
  148. run(64, 16, 64);
  149. run(128, 16, 128);
  150. run(256, 16, 256);
  151. run(512, 16, 512);
  152. // run N, K >> M
  153. run(16, 32, 32);
  154. run(16, 64, 64);
  155. run(16, 128, 128);
  156. run(16, 256, 256);
  157. run(16, 512, 512);
  158. // run M >> K, N
  159. run(32, 16, 16);
  160. run(64, 16, 16);
  161. run(128, 16, 16);
  162. run(256, 16, 16);
  163. run(512, 16, 16);
  164. // run K >> M, N
  165. run(16, 32, 16);
  166. run(16, 64, 16);
  167. run(16, 128, 16);
  168. run(16, 256, 16);
  169. run(16, 512, 16);
  170. // run N >> M, K
  171. run(16, 16, 32);
  172. run(16, 16, 64);
  173. run(16, 16, 128);
  174. run(16, 16, 256);
  175. run(16, 16, 512);
  176. // run VGG
  177. // conv 1.1
  178. run(64, 3 * 3 * 3, 224 * 224);
  179. // conv 1.2
  180. run(128, 64 * 3 * 3, 112 * 112);
  181. // conv 2.1
  182. run(128, 128 * 3 * 3, 112 * 112);
  183. // conv 2.2
  184. run(128, 128 * 3 * 3, 56 * 56);
  185. // conv 3.1
  186. run(256, 128 * 3 * 3, 56 * 56);
  187. // conv 3.2
  188. run(256, 256 * 3 * 3, 28 * 28);
  189. // conv 4.1
  190. run(512, 256 * 3 * 3, 28 * 28);
  191. // conv 4.2
  192. run(512, 512 * 3 * 3, 14 * 14);
  193. }
  194. #endif
  195. #endif
  196. #if MEGDNN_WITH_BENCHMARK
  197. TEST_F(AARCH64, BENCHMARK_CONVOLUTION_STRIDE2) {
  198. using Param = param::Convolution;
  199. auto run = [&](const TensorShapeArray& shapes, Param param) {
  200. Benchmarker<Convolution> benchmarker_float(handle());
  201. size_t RUN = 50;
  202. auto tfloat =
  203. benchmarker_float.set_display(false)
  204. .set_dtype(0, dtype::Float32{})
  205. .set_dtype(1, dtype::Float32{})
  206. .set_before_exec_callback(AlgoChecker<Convolution>(
  207. "CONVOLUTION_DEFAULT_ARMV8F32STRD2_LARGE_"
  208. "GROUP"))
  209. .set_times(RUN)
  210. .set_param(param)
  211. .exec(shapes);
  212. size_t IC = shapes[1][1];
  213. size_t FH = shapes[1][2];
  214. size_t FW = shapes[1][3];
  215. TensorLayout dst_layout;
  216. auto opr = handle()->create_operator<Convolution>();
  217. opr->param() = param;
  218. opr->deduce_layout({shapes[0], dtype::Float32()},
  219. {shapes[1], dtype::Float32()}, dst_layout);
  220. printf("fp32 flops: %.3f mflops\n",
  221. (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
  222. (tfloat / RUN * 1000));
  223. };
  224. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  225. auto run1 = [&](const TensorShapeArray& shapes, Param param) {
  226. Benchmarker<Convolution> benchmarker_float(handle());
  227. size_t RUN = 50;
  228. auto tfloat =
  229. benchmarker_float.set_display(false)
  230. .set_dtype(0, dtype::Float16())
  231. .set_dtype(1, dtype::Float16())
  232. .set_before_exec_callback(AlgoChecker<Convolution>(
  233. "CONVOLUTION_DEFAULT_ARMV8F16STRD2_LARGE_"
  234. "GROUP"))
  235. .set_times(RUN)
  236. .set_param(param)
  237. .exec(shapes);
  238. size_t IC = shapes[1][1];
  239. size_t FH = shapes[1][2];
  240. size_t FW = shapes[1][3];
  241. TensorLayout dst_layout;
  242. auto opr = handle()->create_operator<Convolution>();
  243. opr->param() = param;
  244. opr->deduce_layout({shapes[0], dtype::Float16()},
  245. {shapes[1], dtype::Float16()}, dst_layout);
  246. printf("fp16 flops: %.3f mflops\n",
  247. (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
  248. (tfloat / RUN * 1000));
  249. };
  250. #endif
  251. auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  252. size_t stride) {
  253. Param param;
  254. param.stride_h = stride;
  255. param.stride_w = stride;
  256. param.pad_h = kernel / 2;
  257. param.pad_w = kernel / 2;
  258. printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n",
  259. oc, ic, w, h, stride, kernel);
  260. run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
  261. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  262. run1({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
  263. #endif
  264. };
  265. for (size_t kernel : {2, 3, 5, 7}) {
  266. for (size_t ic : {3, 6, 12, 24}) {
  267. for (size_t oc : {3, 6, 12, 24}) {
  268. for (size_t size : {4, 7, 8, 14, 16, 17, 28, 32, 34, 64, 112}) {
  269. profile(oc, ic, size, size, kernel, 2);
  270. }
  271. }
  272. }
  273. }
  274. }
  275. #endif
  276. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台