You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

convolution.cpp 9.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. #include "test/aarch64/fixture.h"
  2. #include "test/common/benchmarker.h"
  3. #include "test/common/checker.h"
  4. #include "test/common/convolution.h"
  5. #include "test/common/rng.h"
  6. using namespace megdnn;
  7. using namespace test;
  8. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  9. TEST_F(AARCH64, CONVOLUTION_BACKWARD_DATA_FP16) {
  10. Checker<ConvolutionBackwardData> checker(handle());
  11. using Param = ConvolutionBackwardData::Param;
  12. Param param;
  13. auto run = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc, size_t fh,
  14. size_t fw, size_t stride, size_t padding, size_t group = 1) {
  15. param.pad_h = param.pad_w = padding;
  16. param.stride_h = param.stride_w = stride;
  17. TensorLayout diff = TensorLayout{{n, oc * group, oh, ow}, dtype::Float16()};
  18. TensorLayout grad;
  19. TensorLayout filter;
  20. if (group == 1) {
  21. param.sparse = Param::Sparse::DENSE;
  22. filter = {{oc, ic, fh, fw}, dtype::Float16()};
  23. } else {
  24. param.sparse = Param::Sparse::GROUP;
  25. filter = {{group, oc, ic, fh, fw}, dtype::Float16()};
  26. }
  27. // TensorLayout grad;
  28. {
  29. auto opr = handle()->create_operator<ConvolutionBackwardData>();
  30. opr->param() = param;
  31. opr->deduce_layout(filter, diff, grad);
  32. }
  33. NormalRNG rng(10.f);
  34. checker.set_param(param)
  35. .set_dtype(0, dtype::Float16())
  36. .set_dtype(1, dtype::Float16())
  37. .set_dtype(2, dtype::Float16())
  38. .set_rng(0, &rng)
  39. .set_rng(1, &rng)
  40. .set_epsilon(1e-2)
  41. .set_before_exec_callback(
  42. AlgoChecker<ConvolutionBackwardData>("DeconvMatmul"));
  43. checker.exec(TensorLayoutArray{filter, diff, grad});
  44. };
  45. for (auto mode : {Param::Mode::CONVOLUTION, Param::Mode::CROSS_CORRELATION}) {
  46. param.mode = mode;
  47. run(4, 3, 10, 13, 5, 1, 1, 1, 0, 1);
  48. run(4, 3, 10, 45, 2, 1, 1, 1, 0, 4);
  49. run(2, 3, 9, 12, 2, 4, 6, 1, 0, 1);
  50. run(3, 4, 17, 32, 2, 3, 2, 5, 4, 4);
  51. run(5, 5, 24, 43, 11, 9, 3, 3, 12, 2);
  52. run(2, 3, 20, 33, 3, 5, 7, 4, 15, 2);
  53. run(4, 4, 6, 7, 9, 3, 2, 2, 1, 3);
  54. }
  55. }
  56. #if MEGDNN_WITH_BENCHMARK
  57. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_QUICK_FP16) {
  58. int exec_times = 10;
  59. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  60. benchmarker_gemm.set_times(exec_times);
  61. float mod = 1000 * exec_times / 1e9;
  62. auto run = [&](size_t M, size_t K, size_t N) {
  63. float time = 1.f, perf = 1.f;
  64. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")" << std::endl;
  65. benchmarker_gemm.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
  66. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  67. perf = 2.f * M * K * N / time * mod;
  68. std::cout << "gemm fp32, Performance is " << perf << " Gflops" << std::endl;
  69. benchmarker_gemm.set_dtype(0, dtype::Float16()).set_dtype(1, dtype::Float16());
  70. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  71. perf = 2.f * M * K * N / time * mod;
  72. std::cout << "gemm fp16, Performance is " << perf << " Gflops" << std::endl;
  73. };
  74. // run M = K = N
  75. run(32, 32, 32);
  76. run(64, 64, 64);
  77. run(128, 128, 128);
  78. run(256, 256, 256);
  79. run(512, 512, 512);
  80. run(1024, 1024, 1024);
  81. run(2048, 2048, 2048);
  82. }
  83. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_ALL_SIZES_FP16) {
  84. int exec_times = 10;
  85. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  86. benchmarker_gemm.set_times(exec_times);
  87. float mod = 1000 * exec_times / 1e9;
  88. auto run = [&](size_t M, size_t K, size_t N) {
  89. float time = 1.f, perf = 1.f;
  90. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")" << std::endl;
  91. benchmarker_gemm.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
  92. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  93. perf = 2.f * M * K * N / time * mod;
  94. std::cout << "gemm fp32, Performance is " << perf << " Gflops" << std::endl;
  95. benchmarker_gemm.set_dtype(0, dtype::Float16()).set_dtype(1, dtype::Float16());
  96. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  97. perf = 2.f * M * K * N / time * mod;
  98. std::cout << "gemm fp16, Performance is " << perf << " Gflops" << std::endl;
  99. };
  100. std::cout << "warm up:\n";
  101. for (int i = 0; i < 50; i++) {
  102. benchmarker_gemm.set_dtype(0, dtype::Float32())
  103. .set_dtype(1, dtype::Float32())
  104. .set_display(false)
  105. .exec({{256, 256}, {256, 256}, {}});
  106. benchmarker_gemm.set_display(true);
  107. }
  108. // run M = K = N
  109. run(8, 8, 8);
  110. run(16, 16, 16);
  111. run(32, 32, 32);
  112. run(64, 64, 64);
  113. run(128, 128, 128);
  114. run(256, 256, 256);
  115. run(512, 512, 512);
  116. run(1024, 1024, 1024);
  117. run(2048, 2048, 2048);
  118. // run sgmev like
  119. run(32, 32, 1);
  120. run(64, 64, 1);
  121. run(128, 128, 1);
  122. run(256, 256, 1);
  123. run(512, 512, 1);
  124. // run M, N >> K
  125. run(32, 16, 32);
  126. run(64, 16, 64);
  127. run(128, 16, 128);
  128. run(256, 16, 256);
  129. run(512, 16, 512);
  130. // run N, K >> M
  131. run(16, 32, 32);
  132. run(16, 64, 64);
  133. run(16, 128, 128);
  134. run(16, 256, 256);
  135. run(16, 512, 512);
  136. // run M >> K, N
  137. run(32, 16, 16);
  138. run(64, 16, 16);
  139. run(128, 16, 16);
  140. run(256, 16, 16);
  141. run(512, 16, 16);
  142. // run K >> M, N
  143. run(16, 32, 16);
  144. run(16, 64, 16);
  145. run(16, 128, 16);
  146. run(16, 256, 16);
  147. run(16, 512, 16);
  148. // run N >> M, K
  149. run(16, 16, 32);
  150. run(16, 16, 64);
  151. run(16, 16, 128);
  152. run(16, 16, 256);
  153. run(16, 16, 512);
  154. // run VGG
  155. // conv 1.1
  156. run(64, 3 * 3 * 3, 224 * 224);
  157. // conv 1.2
  158. run(128, 64 * 3 * 3, 112 * 112);
  159. // conv 2.1
  160. run(128, 128 * 3 * 3, 112 * 112);
  161. // conv 2.2
  162. run(128, 128 * 3 * 3, 56 * 56);
  163. // conv 3.1
  164. run(256, 128 * 3 * 3, 56 * 56);
  165. // conv 3.2
  166. run(256, 256 * 3 * 3, 28 * 28);
  167. // conv 4.1
  168. run(512, 256 * 3 * 3, 28 * 28);
  169. // conv 4.2
  170. run(512, 512 * 3 * 3, 14 * 14);
  171. }
  172. #endif
  173. #endif
  174. #if MEGDNN_WITH_BENCHMARK
  175. TEST_F(AARCH64, BENCHMARK_CONVOLUTION_STRIDE2) {
  176. using Param = param::Convolution;
  177. auto run = [&](const TensorShapeArray& shapes, Param param) {
  178. Benchmarker<Convolution> benchmarker_float(handle());
  179. size_t RUN = 50;
  180. auto tfloat = benchmarker_float.set_display(false)
  181. .set_dtype(0, dtype::Float32{})
  182. .set_dtype(1, dtype::Float32{})
  183. .set_before_exec_callback(AlgoChecker<Convolution>(
  184. "CONVOLUTION_DEFAULT_ARMV8F32STRD2_LARGE_"
  185. "GROUP"))
  186. .set_times(RUN)
  187. .set_param(param)
  188. .exec(shapes);
  189. size_t IC = shapes[1][1];
  190. size_t FH = shapes[1][2];
  191. size_t FW = shapes[1][3];
  192. TensorLayout dst_layout;
  193. auto opr = handle()->create_operator<Convolution>();
  194. opr->param() = param;
  195. opr->deduce_layout(
  196. {shapes[0], dtype::Float32()}, {shapes[1], dtype::Float32()},
  197. dst_layout);
  198. printf("fp32 flops: %.3f mflops\n",
  199. (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
  200. (tfloat / RUN * 1000));
  201. };
  202. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  203. auto run1 = [&](const TensorShapeArray& shapes, Param param) {
  204. Benchmarker<Convolution> benchmarker_float(handle());
  205. size_t RUN = 50;
  206. auto tfloat = benchmarker_float.set_display(false)
  207. .set_dtype(0, dtype::Float16())
  208. .set_dtype(1, dtype::Float16())
  209. .set_before_exec_callback(AlgoChecker<Convolution>(
  210. "CONVOLUTION_DEFAULT_ARMV8F16STRD2_LARGE_"
  211. "GROUP"))
  212. .set_times(RUN)
  213. .set_param(param)
  214. .exec(shapes);
  215. size_t IC = shapes[1][1];
  216. size_t FH = shapes[1][2];
  217. size_t FW = shapes[1][3];
  218. TensorLayout dst_layout;
  219. auto opr = handle()->create_operator<Convolution>();
  220. opr->param() = param;
  221. opr->deduce_layout(
  222. {shapes[0], dtype::Float16()}, {shapes[1], dtype::Float16()},
  223. dst_layout);
  224. printf("fp16 flops: %.3f mflops\n",
  225. (IC * dst_layout.total_nr_elems() * FH * FW * 2) /
  226. (tfloat / RUN * 1000));
  227. };
  228. #endif
  229. auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
  230. size_t stride) {
  231. Param param;
  232. param.stride_h = stride;
  233. param.stride_w = stride;
  234. param.pad_h = kernel / 2;
  235. param.pad_w = kernel / 2;
  236. printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n", oc, ic,
  237. w, h, stride, kernel);
  238. run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
  239. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  240. run1({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
  241. #endif
  242. };
  243. for (size_t kernel : {2, 3, 5, 7}) {
  244. for (size_t ic : {3, 6, 12, 24}) {
  245. for (size_t oc : {3, 6, 12, 24}) {
  246. for (size_t size : {4, 7, 8, 14, 16, 17, 28, 32, 34, 64, 112}) {
  247. profile(oc, ic, size, size, kernel, 2);
  248. }
  249. }
  250. }
  251. }
  252. }
  253. #endif
  254. // vim: syntax=cpp.doxygen