You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

matrix_mul.cpp 30 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785
  1. /**
  2. * \file dnn/test/aarch64/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "test/aarch64/fixture.h"
  13. #include "test/common/benchmarker.h"
  14. #include "test/common/checker.h"
  15. #include "test/common/matrix_mul.h"
  16. #include "test/common/rng.h"
  17. #include "test/arm_common/cpuinfo_help.h"
  18. using namespace megdnn;
  19. using namespace test;
  20. TEST_F(AARCH64, MATRIX_MUL_FP32K8X12) {
  21. matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
  22. dtype::Float32{}, handle(),
  23. "AARCH64_F32K8X12X1");
  24. }
  25. #if MGB_ENABLE_CPUINFO
  26. TEST_F(AARCH64, MATRIX_MUL_FP32K8X12_A53) {
  27. CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a53);
  28. matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
  29. dtype::Float32{}, handle(),
  30. "AARCH64_F32K8X12X1");
  31. }
  32. TEST_F(AARCH64, MATRIX_MUL_FP32K8X12_A55) {
  33. CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a55);
  34. matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
  35. dtype::Float32{}, handle(),
  36. "AARCH64_F32K8X12X1");
  37. }
  38. #endif
  39. TEST_F(AARCH64, MATRIX_MUL_FP32K4X16) {
  40. matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
  41. dtype::Float32{}, handle(),
  42. "AARCH64_F32K4X16X1");
  43. }
  44. TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4) {
  45. matrix_mul::check_matrix_mul(
  46. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  47. "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
  48. }
  49. #if MGB_ENABLE_CPUINFO
  50. TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4_A53) {
  51. CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a53);
  52. matrix_mul::check_matrix_mul(
  53. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  54. "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
  55. }
  56. TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4_A55) {
  57. CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a55);
  58. matrix_mul::check_matrix_mul(
  59. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  60. "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
  61. }
  62. #endif
  63. TEST_F(AARCH64, MATRIX_MUL_FP32_MK4) {
  64. matrix_mul::check_matrix_mul(
  65. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  66. "AARCH64_F32_MK4_4x16", param::MatrixMul::Format::MK4, 1);
  67. }
  68. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  69. TEST_F(AARCH64, MATRIX_MUL_F16_K8X24X1) {
  70. matrix_mul::check_matrix_mul(dtype::Float16{}, dtype::Float16{},
  71. dtype::Float16{}, handle(),
  72. "AARCH64_F16_K8X24X1");
  73. }
  74. TEST_F(AARCH64, MATRIX_MUL_F16_MK8) {
  75. matrix_mul::check_matrix_mul(
  76. dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
  77. "AARCH64_F16_MK8_8X8", param::MatrixMul::Format::MK8, 1);
  78. }
  79. #endif
  80. #if __ARM_FEATURE_DOTPROD
  81. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K8X12X4_DOTPROD) {
  82. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  83. handle(), "AARCH64_INT8X8X32_K8X12X4_DOTPROD");
  84. }
  85. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_MK4_8X12X4_DOTPROD) {
  86. std::vector<matrix_mul::TestArg> args;
  87. for (size_t m : {1, 2, 3, 4, 5, 6, 7, 10, 11})
  88. for (size_t n : {2, 3, 4, 5, 8, 12, 13, 14, 15, 16, 31})
  89. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  90. args.emplace_back(m, n, k, 0);
  91. matrix_mul::check_matrix_mul(
  92. dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
  93. "AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD",
  94. param::MatrixMul::Format::MK4_DOT, 1, 1e-3, std::move(args));
  95. }
  96. #else
  97. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K4X4X16) {
  98. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  99. handle(), "AARCH64_INT8X8X32_K4X4X16");
  100. }
  101. TEST_F(AARCH64, MATRIX_MUL_INT8_MK4) {
  102. std::vector<matrix_mul::TestArg> args;
  103. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  104. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  105. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  106. args.emplace_back(m, n, k, 0);
  107. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  108. handle(), "AARCH64_INT8X8X32_MK4_4X4X16",
  109. param::MatrixMul::Format::MK4, 1, 1e-3,
  110. std::move(args));
  111. }
  112. TEST_F(AARCH64, MATRIX_MUL_MK4_8x8x16_4x4) {
  113. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  114. handle(), "AARCH64_INT8X8X16_MK4_4X4X8",
  115. param::MatrixMul::Format::MK4, 1);
  116. }
  117. TEST_F(AARCH64, MATRIX_MUL_MK4_8x8x16) {
  118. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  119. handle(), "AARCH64_INT8X8X16_MK4_16X12X4",
  120. param::MatrixMul::Format::MK4, 1);
  121. }
  122. TEST_F(AARCH64, MATRIX_MUL_INT8x8x32_K8x8x8) {
  123. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  124. handle(), "AARCH64_INT8X8X32_K8X8X8");
  125. }
  126. #endif
  127. TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K8x8x8) {
  128. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  129. handle(), "AARCH64_INT8X8X16_K8X8X8");
  130. }
  131. TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K4x4x16) {
  132. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  133. handle(), "AARCH64_INT8X8X16_K4X4X16");
  134. }
  135. TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_K12X8X1) {
  136. matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  137. handle(), "AARCH64_INT16X16X32_K12X8X1");
  138. }
  139. TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_MK8) {
  140. matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  141. handle(), "AARCH64_INT16X16X32_MK8_8X8",
  142. param::MatrixMul::Format::MK8, 1);
  143. }
  144. //! FIXME: need to add tests of GEMV and QUINT8
  145. #if MEGDNN_WITH_BENCHMARK
  146. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K4X16) {
  147. constexpr size_t RUNS = 50;
  148. param::MatrixMul param;
  149. param.transposeA = false;
  150. param.transposeB = false;
  151. Benchmarker<MatrixMul> benchmarker_K4X16(handle());
  152. Benchmarker<MatrixMul> benchmarker_K12X8(handle());
  153. benchmarker_K4X16.set_times(RUNS)
  154. .set_dtype(0, dtype::Float32{})
  155. .set_dtype(1, dtype::Float32{})
  156. .set_dtype(2, dtype::Float32{})
  157. .set_param(param)
  158. .set_display(false);
  159. benchmarker_K4X16.set_before_exec_callback(
  160. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  161. benchmarker_K12X8.set_before_exec_callback(
  162. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  163. benchmarker_K12X8.set_times(RUNS)
  164. .set_dtype(0, dtype::Float32{})
  165. .set_dtype(1, dtype::Float32{})
  166. .set_dtype(2, dtype::Float32{})
  167. .set_param(param)
  168. .set_display(false);
  169. auto run = [&](size_t M, size_t N, size_t K) {
  170. TensorShape A, B;
  171. if (param.transposeA) {
  172. A = TensorShape{K, M};
  173. } else {
  174. A = TensorShape{M, K};
  175. }
  176. if (param.transposeB) {
  177. B = TensorShape{N, K};
  178. } else {
  179. B = TensorShape{K, N};
  180. }
  181. auto k4x16_used = benchmarker_K4X16.exec({A, B, {}}) / RUNS;
  182. auto k12x8_used = benchmarker_K12X8.exec({A, B, {}}) / RUNS;
  183. float computations = 2.f * M * K * N * 1e-6;
  184. printf("run: {%zu{M} %zu{K} %zu{N}} k4x16: %f ms %f Gflops k12x8: %f "
  185. "ms "
  186. "%f Gflops k4x16_vs_k12x8: %f\n",
  187. M, K, N, k4x16_used, computations / k4x16_used, k12x8_used,
  188. computations / k12x8_used, k12x8_used / k4x16_used);
  189. };
  190. run(256, 256, 128);
  191. run(384, 384, 384);
  192. for (size_t k = 4; k <= 256; k *= 8) {
  193. for (size_t m = 4; m <= 256; m *= 4) {
  194. for (size_t n = 4; n <= 256; n *= 4) {
  195. run(m, n, k);
  196. }
  197. printf("\n");
  198. }
  199. printf("\n");
  200. }
  201. }
  202. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_8X8X8) {
  203. constexpr size_t RUNS = 50;
  204. param::MatrixMul param;
  205. param.transposeA = false;
  206. param.transposeB = false;
  207. Benchmarker<MatrixMul> benchmarker_int(handle());
  208. Benchmarker<MatrixMul> benchmarker_int32(handle());
  209. benchmarker_int.set_times(RUNS)
  210. .set_dtype(0, dtype::Int8{})
  211. .set_dtype(1, dtype::Int8{})
  212. .set_dtype(2, dtype::Int16{})
  213. .set_param(param)
  214. .set_display(false);
  215. benchmarker_int.set_before_exec_callback(
  216. AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K8X8X8"));
  217. benchmarker_int32.set_before_exec_callback(
  218. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X8X8"));
  219. benchmarker_int32.set_times(RUNS)
  220. .set_dtype(0, dtype::Int8{})
  221. .set_dtype(1, dtype::Int8{})
  222. .set_dtype(2, dtype::Int32{})
  223. .set_param(param)
  224. .set_display(false);
  225. Benchmarker<MatrixMul> benchmarker_float(handle());
  226. benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
  227. auto run = [&](size_t M, size_t N, size_t K) {
  228. TensorShape A, B;
  229. if (param.transposeA) {
  230. A = TensorShape{K, M};
  231. } else {
  232. A = TensorShape{M, K};
  233. }
  234. if (param.transposeB) {
  235. B = TensorShape{N, K};
  236. } else {
  237. B = TensorShape{K, N};
  238. }
  239. auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
  240. auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
  241. auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
  242. float computations = 2.f * M * K * N * 1e-6;
  243. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  244. "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
  245. M, K, N, float_used, computations / float_used, int_used,
  246. computations / int_used, float_used / int_used,
  247. int32_used / int_used);
  248. };
  249. run(256, 256, 256);
  250. for (size_t k = 4; k <= 256; k *= 8) {
  251. for (size_t m = 4; m <= 256; m *= 4) {
  252. for (size_t n = 4; n <= 256; n *= 4) {
  253. run(m, n, k);
  254. }
  255. std::cout << std::endl;
  256. }
  257. std::cout << std::endl;
  258. }
  259. }
  260. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT32_MK_4X4X16) {
  261. constexpr size_t RUNS = 50;
  262. param::MatrixMul param;
  263. param.transposeA = false;
  264. param.transposeB = false;
  265. Benchmarker<MatrixMul> benchmarker(handle());
  266. Benchmarker<MatrixMul> benchmarker_mk4(handle());
  267. benchmarker.set_times(RUNS)
  268. .set_dtype(0, dtype::Int8{})
  269. .set_dtype(1, dtype::Int8{})
  270. .set_dtype(2, dtype::Int32{})
  271. .set_param(param)
  272. .set_display(false);
  273. benchmarker.set_before_exec_callback(
  274. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
  275. param.format = MatrixMul::Param::Format::MK4;
  276. benchmarker_mk4.set_before_exec_callback(
  277. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_4X4X16"));
  278. benchmarker_mk4.set_times(RUNS)
  279. .set_dtype(0, dtype::Int8{})
  280. .set_dtype(1, dtype::Int8{})
  281. .set_dtype(2, dtype::Int32{})
  282. .set_param(param)
  283. .set_display(false);
  284. auto run = [&](size_t M, size_t N, size_t K) {
  285. auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
  286. auto mk_used = benchmarker_mk4.exec(
  287. {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  288. RUNS;
  289. float computations = 2.f * M * K * N * 1e-6;
  290. printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
  291. "%f Gflops speedup_vs_normal: %f\n",
  292. M, K, N, default_used, computations / default_used, mk_used,
  293. computations / mk_used, default_used / mk_used);
  294. };
  295. run(256, 256, 128);
  296. for (size_t k = 4; k <= 512; k *= 2) {
  297. for (size_t m = 4; m <= 512; m *= 2) {
  298. for (size_t n = 4; n <= 512; n *= 2) {
  299. run(m, n, k);
  300. }
  301. }
  302. std::cout << std::endl;
  303. }
  304. }
  305. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_MK4_8x8x16) {
  306. constexpr size_t RUNS = 50;
  307. param::MatrixMul param;
  308. param.transposeA = false;
  309. param.transposeB = false;
  310. Benchmarker<MatrixMul> benchmarker(handle());
  311. Benchmarker<MatrixMul> benchmarker_mk4(handle());
  312. Benchmarker<MatrixMul> benchmarker_mk4_16x12(handle());
  313. benchmarker.set_times(RUNS)
  314. .set_dtype(0, dtype::Int8{})
  315. .set_dtype(1, dtype::Int8{})
  316. .set_dtype(2, dtype::Int16{})
  317. .set_param(param)
  318. .set_display(false);
  319. benchmarker.set_before_exec_callback(
  320. AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
  321. param.format = MatrixMul::Param::Format::MK4;
  322. benchmarker_mk4.set_before_exec_callback(
  323. AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_4X4X8"));
  324. benchmarker_mk4.set_times(RUNS)
  325. .set_dtype(0, dtype::Int8{})
  326. .set_dtype(1, dtype::Int8{})
  327. .set_dtype(2, dtype::Int16{})
  328. .set_param(param)
  329. .set_display(false);
  330. benchmarker_mk4_16x12.set_before_exec_callback(
  331. AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_16X12X4"));
  332. benchmarker_mk4_16x12.set_times(RUNS)
  333. .set_dtype(0, dtype::Int8{})
  334. .set_dtype(1, dtype::Int8{})
  335. .set_dtype(2, dtype::Int16{})
  336. .set_param(param)
  337. .set_display(false);
  338. auto run = [&](size_t M, size_t N, size_t K) {
  339. auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
  340. auto mk_used = benchmarker_mk4.exec(
  341. {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  342. RUNS;
  343. auto mk4_16x12_used =
  344. benchmarker_mk4_16x12.exec(
  345. {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  346. RUNS;
  347. float computations = 2.f * M * K * N * 1e-6;
  348. printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
  349. "%f Gflops speedup: %f, mk4_16x12 %f Gflops speedup: %f\n",
  350. M, K, N, default_used, computations / default_used, mk_used,
  351. computations / mk_used, default_used / mk_used,
  352. computations / mk4_16x12_used, default_used / mk4_16x12_used);
  353. };
  354. run(384, 384, 384);
  355. }
  356. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_4X4X16) {
  357. constexpr size_t RUNS = 50;
  358. param::MatrixMul param;
  359. param.transposeA = false;
  360. param.transposeB = false;
  361. Benchmarker<MatrixMul> benchmarker_int(handle());
  362. Benchmarker<MatrixMul> benchmarker_int32(handle());
  363. benchmarker_int.set_times(RUNS)
  364. .set_dtype(0, dtype::Int8{})
  365. .set_dtype(1, dtype::Int8{})
  366. .set_dtype(2, dtype::Int16{})
  367. .set_param(param)
  368. .set_display(false);
  369. benchmarker_int.set_before_exec_callback(
  370. AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
  371. benchmarker_int32.set_before_exec_callback(
  372. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
  373. benchmarker_int32.set_times(RUNS)
  374. .set_dtype(0, dtype::Int8{})
  375. .set_dtype(1, dtype::Int8{})
  376. .set_dtype(2, dtype::Int32{})
  377. .set_param(param)
  378. .set_display(false);
  379. Benchmarker<MatrixMul> benchmarker_float(handle());
  380. benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
  381. auto run = [&](size_t M, size_t N, size_t K) {
  382. TensorShape A, B;
  383. if (param.transposeA) {
  384. A = TensorShape{K, M};
  385. } else {
  386. A = TensorShape{M, K};
  387. }
  388. if (param.transposeB) {
  389. B = TensorShape{N, K};
  390. } else {
  391. B = TensorShape{K, N};
  392. }
  393. auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
  394. auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
  395. auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
  396. float computations = 2.f * M * K * N * 1e-6;
  397. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  398. "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
  399. M, K, N, float_used, computations / float_used, int_used,
  400. computations / int_used, float_used / int_used,
  401. int32_used / int_used);
  402. };
  403. run(256, 256, 128);
  404. run(256, 256, 256);
  405. for (size_t k = 4; k <= 256; k *= 4) {
  406. for (size_t m = 4; m <= 256; m *= 4) {
  407. for (size_t n = 4; n <= 256; n *= 4) {
  408. run(m, n, k);
  409. }
  410. }
  411. std::cout << std::endl;
  412. }
  413. }
  414. TEST_F(AARCH64, BENCHMARK_GEMV) {
  415. int exec_times = 10;
  416. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  417. benchmarker_gemm.set_times(exec_times);
  418. float mod = 1000 * exec_times / 1e9;
  419. auto run = [&](size_t M, size_t K, size_t N) {
  420. float time = 1.f, perf = 1.f;
  421. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")"
  422. << std::endl;
  423. benchmarker_gemm.set_dtype(0, dtype::Float32())
  424. .set_dtype(1, dtype::Float32());
  425. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  426. perf = 2.f * M * K * N / time * mod;
  427. std::cout << "gemm fp32, Performance is " << perf << " Gflops"
  428. << std::endl;
  429. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  430. benchmarker_gemm.set_dtype(0, dtype::Float16())
  431. .set_dtype(1, dtype::Float16());
  432. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  433. perf = 2.f * M * K * N / time * mod;
  434. std::cout << "gemm fp16, Performance is " << perf << " Gflops"
  435. << std::endl;
  436. #endif
  437. };
  438. std::cout << "warm up:\n";
  439. for (int i = 0; i < 50; i++) {
  440. benchmarker_gemm.set_dtype(0, dtype::Float32())
  441. .set_dtype(1, dtype::Float32())
  442. .set_display(false)
  443. .exec({{256, 256}, {256, 256}, {}});
  444. benchmarker_gemm.set_display(true);
  445. }
  446. // run gemv
  447. for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
  448. for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
  449. for (size_t N : {112})
  450. run(M, K, N);
  451. }
  452. #if __ARM_FEATURE_DOTPROD
  453. TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_INT_8X8X32) {
  454. constexpr size_t RUNS = 50;
  455. param::MatrixMul param;
  456. param.transposeA = param.transposeB = true;
  457. Benchmarker<MatrixMul> benchmarker_int(handle());
  458. benchmarker_int.set_times(RUNS)
  459. .set_dtype(0, dtype::Int8{})
  460. .set_dtype(1, dtype::Int8{})
  461. .set_dtype(2, {})
  462. .set_param(param)
  463. .set_display(false);
  464. Benchmarker<MatrixMul> benchmarker_float(handle());
  465. benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
  466. auto run = [&](size_t M, size_t N, size_t K) {
  467. auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
  468. auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
  469. float computations = 2.f * M * K * N * 1e-6;
  470. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  471. "%f Gflops speedup: %f\n",
  472. M, K, N, float_used, computations / float_used, int_used,
  473. computations / int_used, float_used / int_used);
  474. };
  475. run(256, 12 * 24, 256);
  476. for (size_t M : {8, 64, 112, 256}) {
  477. for (size_t K : {8, 64, 112, 256}) {
  478. for (size_t N : {8, 64, 112, 256}) {
  479. run(M, N, K);
  480. }
  481. }
  482. }
  483. }
  484. TEST_F(AARCH64, BENCHMARK_GEMV_INT_8X8X32) {
  485. constexpr size_t RUNS = 50;
  486. param::MatrixMul param;
  487. Benchmarker<MatrixMul> benchmarker_int(handle());
  488. benchmarker_int.set_times(RUNS)
  489. .set_dtype(0, dtype::Int8{})
  490. .set_dtype(1, dtype::Int8{})
  491. .set_dtype(2, {})
  492. .set_param(param)
  493. .set_display(false);
  494. Benchmarker<MatrixMul> benchmarker_float(handle());
  495. benchmarker_float.set_display(false).set_times(RUNS);
  496. auto run = [&](size_t M, size_t N, size_t K) {
  497. auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
  498. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  499. float computations = 2.f * M * K * N * 1e-6;
  500. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  501. "%f Gflops speedup: %f\n",
  502. M, K, N, float_used, computations / float_used, int_used,
  503. computations / int_used, float_used / int_used);
  504. };
  505. for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
  506. for (size_t N : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
  507. for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
  508. run(M, N, K);
  509. }
  510. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT8X8X32_MK4_8X12X4) {
  511. constexpr size_t RUNS = 50;
  512. param::MatrixMul param;
  513. param.transposeA = false;
  514. param.transposeB = false;
  515. Benchmarker<MatrixMul> benchmarker(handle());
  516. Benchmarker<MatrixMul> benchmarker_mk4(handle());
  517. benchmarker.set_times(RUNS)
  518. .set_dtype(0, dtype::Int8{})
  519. .set_dtype(1, dtype::Int8{})
  520. .set_dtype(2, dtype::Int32{})
  521. .set_param(param)
  522. .set_display(false);
  523. benchmarker.set_before_exec_callback(
  524. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X12X4"));
  525. param.format = MatrixMul::Param::Format::MK4_DOT;
  526. benchmarker_mk4.set_before_exec_callback(
  527. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD"));
  528. benchmarker_mk4.set_times(RUNS)
  529. .set_dtype(0, dtype::Int8{})
  530. .set_dtype(1, dtype::Int8{})
  531. .set_dtype(2, dtype::Int32{})
  532. .set_param(param)
  533. .set_display(false);
  534. auto run = [&](size_t M, size_t N, size_t K) {
  535. auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
  536. auto mk_used = benchmarker_mk4.exec(
  537. {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  538. RUNS;
  539. float computations = 2.f * M * K * N * 1e-6;
  540. printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
  541. "%f Gflops speedup_vs_normal: %f\n",
  542. M, K, N, default_used, computations / default_used, mk_used,
  543. computations / mk_used, default_used / mk_used);
  544. };
  545. run(256, 256, 128);
  546. for (size_t k = 4; k <= 512; k *= 2) {
  547. for (size_t m = 4; m <= 512; m *= 2) {
  548. for (size_t n = 4; n <= 512; n *= 2) {
  549. run(m, n, k);
  550. }
  551. }
  552. std::cout << std::endl;
  553. }
  554. }
  555. #endif // __ARM_FEATURE_DOTPROD
  556. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  557. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_F16_MK8) {
  558. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  559. matrix_mul::benchmark_with_contrast(
  560. handle(), args, dtype::Float16{}, dtype::Float16{},
  561. dtype::Float16{}, "AARCH64_F16_MK8_8X8",
  562. param::MatrixMul::Format::MK8, dtype::Float16{}, dtype::Float16{},
  563. dtype::Float16{}, "AARCH64_F16_K8X24X1");
  564. }
  565. #endif
  566. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32) {
  567. constexpr size_t RUNS = 50;
  568. Benchmarker<MatrixMul> benchmarker_int(handle());
  569. benchmarker_int.set_times(RUNS)
  570. .set_dtype(0, dtype::Int16{})
  571. .set_dtype(1, dtype::Int16{})
  572. .set_dtype(2, dtype::Int32{})
  573. .set_display(false);
  574. Benchmarker<MatrixMul> benchmarker_float(handle());
  575. benchmarker_float.set_display(false).set_times(RUNS);
  576. auto run = [&](size_t M, size_t N, size_t K, int mask) {
  577. param::MatrixMul param;
  578. param.transposeA = mask & 0x1;
  579. param.transposeB = mask & 0x2;
  580. benchmarker_int.set_param(param);
  581. benchmarker_float.set_param(param);
  582. TensorShape A, B;
  583. if (param.transposeA) {
  584. A = TensorShape{K, M};
  585. } else {
  586. A = TensorShape{M, K};
  587. }
  588. if (param.transposeB) {
  589. B = TensorShape{N, K};
  590. } else {
  591. B = TensorShape{K, N};
  592. }
  593. auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
  594. auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
  595. float computations = 2.f * M * K * N * 1e-6;
  596. printf("run: {%zu{M} %zu{K} %zu{N} %d{TA} %d{TB}} "
  597. "float: %f ms %f Gflops int: %f ms "
  598. "%f Gflops speedup: %f\n",
  599. M, K, N, param.transposeA, param.transposeB, float_used,
  600. computations / float_used, int_used, computations / int_used,
  601. float_used / int_used);
  602. };
  603. constexpr int mask = 4;
  604. for (auto i = 0; i < mask; i++) {
  605. for (size_t M : {8, 64, 112, 256}) {
  606. for (size_t K : {8, 64, 112, 256}) {
  607. for (size_t N : {8, 64, 112, 256}) {
  608. run(M, N, K, i);
  609. }
  610. }
  611. }
  612. }
  613. }
  614. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_MK4) {
  615. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
  616. matrix_mul::benchmark_with_contrast(
  617. handle(), args, dtype::Float32{}, dtype::Float32{},
  618. dtype::Float32{}, "AARCH64_F32_MK4_4x16",
  619. param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
  620. dtype::Float32{});
  621. }
  622. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_PACK_MK4) {
  623. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
  624. matrix_mul::benchmark_with_contrast(
  625. handle(), args, dtype::Float32{}, dtype::Float32{},
  626. dtype::Float32{}, "AARCH64_F32_MK4_K8X12X1",
  627. param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
  628. dtype::Float32{}, "AARCH64_F32K8X12X1");
  629. }
  630. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
  631. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  632. matrix_mul::benchmark_with_contrast(
  633. handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  634. "AARCH64_INT16X16X32_MK8_8X8", param::MatrixMul::Format::MK8,
  635. dtype::Int16{}, dtype::Int16{}, dtype::Int32{});
  636. }
  637. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12) {
  638. constexpr size_t RUNS = 50;
  639. param::MatrixMul param;
  640. param.transposeA = param.transposeB = true;
  641. Benchmarker<MatrixMul> benchmarker_k12x8(handle());
  642. Benchmarker<MatrixMul> benchmarker_k8x12(handle());
  643. benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
  644. benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
  645. benchmarker_k12x8.set_before_exec_callback(
  646. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  647. benchmarker_k8x12.set_before_exec_callback(
  648. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  649. auto run = [&](size_t M, size_t N, size_t K) {
  650. auto k12x8_used = benchmarker_k12x8.exec({{K, M}, {N, K}, {}}) / RUNS;
  651. auto k8x12_used = benchmarker_k8x12.exec({{K, M}, {N, K}, {}}) / RUNS;
  652. float computations = 2.f * M * K * N * 1e-6;
  653. printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
  654. "k8x12: %f ms "
  655. "%f Gflops speedup: %f\n",
  656. M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
  657. computations / k8x12_used, k12x8_used / k8x12_used);
  658. };
  659. run(256, 12 * 24, 256);
  660. for (size_t M : {8, 64, 112, 256}) {
  661. for (size_t K : {8, 64, 112, 256}) {
  662. for (size_t N : {8, 64, 112, 256}) {
  663. run(M, N, K);
  664. }
  665. }
  666. }
  667. }
  668. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12_NO_TRANS) {
  669. constexpr size_t RUNS = 50;
  670. param::MatrixMul param;
  671. param.transposeA = param.transposeB = false;
  672. Benchmarker<MatrixMul> benchmarker_k12x8(handle());
  673. Benchmarker<MatrixMul> benchmarker_k8x12(handle());
  674. benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
  675. benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
  676. benchmarker_k12x8.set_before_exec_callback(
  677. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  678. benchmarker_k8x12.set_before_exec_callback(
  679. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  680. auto run = [&](size_t M, size_t N, size_t K) {
  681. auto k12x8_used = benchmarker_k12x8.exec({{M, K}, {K, N}, {}}) / RUNS;
  682. auto k8x12_used = benchmarker_k8x12.exec({{M, K}, {K, N}, {}}) / RUNS;
  683. float computations = 2.f * M * K * N * 1e-6;
  684. printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
  685. "k8x12: %f ms "
  686. "%f Gflops speedup: %f\n",
  687. M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
  688. computations / k8x12_used, k12x8_used / k8x12_used);
  689. };
  690. run(256, 12 * 24, 256);
  691. for (size_t M : {8, 64, 112, 256}) {
  692. for (size_t K : {8, 64, 112, 256}) {
  693. for (size_t N : {8, 64, 112, 256}) {
  694. run(M, N, K);
  695. }
  696. }
  697. }
  698. }
  699. #endif // MEGDNN_WITH_BENCHMARK
  700. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台