You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 26 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684
  1. /**
  2. * \file dnn/test/aarch64/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/aarch64/fixture.h"
  12. #include "test/common/benchmarker.h"
  13. #include "test/common/checker.h"
  14. #include "test/common/matrix_mul.h"
  15. #include "test/common/rng.h"
  16. using namespace megdnn;
  17. using namespace test;
  18. TEST_F(AARCH64, MATRIX_MUL_FP32K8X12) {
  19. matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
  20. dtype::Float32{}, handle(),
  21. "AARCH64_F32K8X12X1");
  22. }
  23. TEST_F(AARCH64, MATRIX_MUL_FP32K4X16) {
  24. matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
  25. dtype::Float32{}, handle(),
  26. "AARCH64_F32K4X16X1");
  27. }
  28. TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4) {
  29. matrix_mul::check_matrix_mul(
  30. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  31. "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
  32. }
  33. TEST_F(AARCH64, MATRIX_MUL_FP32_MK4) {
  34. matrix_mul::check_matrix_mul(
  35. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  36. "AARCH64_F32_MK4_4x16", param::MatrixMul::Format::MK4, 1);
  37. }
  38. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  39. TEST_F(AARCH64, MATRIX_MUL_F16_K8X24X1) {
  40. matrix_mul::check_matrix_mul(dtype::Float16{}, dtype::Float16{},
  41. dtype::Float16{}, handle(),
  42. "AARCH64_F16_K8X24X1");
  43. }
  44. TEST_F(AARCH64, MATRIX_MUL_F16_MK8) {
  45. matrix_mul::check_matrix_mul(
  46. dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
  47. "AARCH64_F16_MK8_8X8", param::MatrixMul::Format::MK8, 1);
  48. }
  49. #endif
  50. #if __ARM_FEATURE_DOTPROD
  51. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K8X12X4_DOTPROD) {
  52. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  53. handle(), "AARCH64_INT8X8X32_K8X12X4_DOTPROD");
  54. }
  55. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_MK4_8X12X4_DOTPROD) {
  56. std::vector<matrix_mul::TestArg> args;
  57. for (size_t m : {1, 2, 3, 4, 5, 6, 7, 10, 11})
  58. for (size_t n : {2, 3, 4, 5, 8, 12, 13, 14, 15, 16, 31})
  59. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  60. args.emplace_back(m, n, k, 0);
  61. matrix_mul::check_matrix_mul(
  62. dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
  63. "AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD",
  64. param::MatrixMul::Format::MK4_DOT, 1, 1e-3, std::move(args));
  65. }
  66. #else
  67. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K4X4X16) {
  68. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  69. handle(), "AARCH64_INT8X8X32_K4X4X16");
  70. }
  71. TEST_F(AARCH64, MATRIX_MUL_INT8_MK4) {
  72. std::vector<matrix_mul::TestArg> args;
  73. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  74. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  75. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  76. args.emplace_back(m, n, k, 0);
  77. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  78. handle(), "AARCH64_INT8X8X32_MK4_4X4X16",
  79. param::MatrixMul::Format::MK4, 1, 1e-3,
  80. std::move(args));
  81. }
  82. TEST_F(AARCH64, MATRIX_MUL_INT8x8x32_K8x8x8) {
  83. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  84. handle(), "AARCH64_INT8X8X32_K8X8X8");
  85. }
  86. #endif
  87. TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K8x8x8) {
  88. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  89. handle(), "AARCH64_INT8X8X16_K8X8X8");
  90. }
  91. TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K4x4x16) {
  92. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  93. handle(), "AARCH64_INT8X8X16_K4X4X16");
  94. }
  95. TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_K12X8X1) {
  96. matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  97. handle(), "AARCH64_INT16X16X32_K12X8X1");
  98. }
  99. TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_MK8) {
  100. matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  101. handle(), "AARCH64_INT16X16X32_MK8_8X8",
  102. param::MatrixMul::Format::MK8, 1);
  103. }
  104. //! FIXME: need to add tests of GEMV and QUINT8
  105. #if MEGDNN_WITH_BENCHMARK
// Benchmark: compare the fp32 4x16-blocked kernel with the 8x12-blocked
// kernel over a sweep of (M, N, K) shapes and print Gflops plus the ratio.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K4X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker_K4X16(handle());
    Benchmarker<MatrixMul> benchmarker_K12X8(handle());
    benchmarker_K4X16.set_times(RUNS)
            .set_dtype(0, dtype::Float32{})
            .set_dtype(1, dtype::Float32{})
            .set_dtype(2, dtype::Float32{})
            .set_param(param)
            .set_display(false);
    // Pin each benchmarker to one algorithm so the heuristic dispatcher
    // cannot pick a different kernel behind our back.
    benchmarker_K4X16.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
    benchmarker_K12X8.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
    benchmarker_K12X8.set_times(RUNS)
            .set_dtype(0, dtype::Float32{})
            .set_dtype(1, dtype::Float32{})
            .set_dtype(2, dtype::Float32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        // Operand shapes follow the transpose flags (both false here, so
        // A is {M, K} and B is {K, N}).
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto k4x16_used = benchmarker_K4X16.exec({A, B, {}}) / RUNS;
        auto k12x8_used = benchmarker_K12X8.exec({A, B, {}}) / RUNS;
        // 2*M*K*N flops for a GEMM; 1e-6 scales ms to Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} k4x16: %f ms %f Gflops k12x8: %f "
               "ms "
               "%f Gflops k4x16_vs_k12x8: %f\n",
               M, K, N, k4x16_used, computations / k4x16_used, k12x8_used,
               computations / k12x8_used, k12x8_used / k4x16_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 256; k *= 8) {
        for (size_t m = 4; m <= 256; m *= 4) {
            for (size_t n = 4; n <= 256; n *= 4) {
                run(m, n, k);
            }
            printf("\n");
        }
        printf("\n");
    }
}
// Benchmark: int8->int16 8x8x8 kernel vs fp32 (default algo) and vs the
// int8->int32 8x8x8 kernel; prints Gflops and speedup ratios.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_8X8X8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    Benchmarker<MatrixMul> benchmarker_int32(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    // Pin each int benchmarker to its specific kernel.
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K8X8X8"));
    benchmarker_int32.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X8X8"));
    benchmarker_int32.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    // Float baseline uses default dtypes/algorithm selection.
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
        auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
        // 2*M*K*N flops; 1e-6 converts ms timings to Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used,
               int32_used / int_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 256; k *= 8) {
        for (size_t m = 4; m <= 256; m *= 4) {
            for (size_t n = 4; n <= 256; n *= 4) {
                run(m, n, k);
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
// Benchmark: int8->int32 4x4x16 kernel in plain layout vs the MK4-packed
// variant of the same blocking; prints Gflops and speedup of MK4.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT32_MK_4X4X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
    // Switch the param to MK4 before configuring the second benchmarker so
    // only benchmarker_mk4 runs in packed layout.
    param.format = MatrixMul::Param::Format::MK4;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_4X4X16"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        // MK4 layout: A is {M/4, K/4, 4, 4}, B is {K/4, N, 4};
        // M and K are assumed to be multiples of 4 in the sweeps below.
        auto mk_used = benchmarker_mk4.exec(
                               {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                       RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup_vs_normal: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 512; k *= 2) {
        for (size_t m = 4; m <= 512; m *= 2) {
            for (size_t n = 4; n <= 512; n *= 2) {
                run(m, n, k);
            }
        }
        std::cout << std::endl;
    }
}
// Benchmark: int8->int16 4x4x16 kernel vs fp32 (default algo) and vs the
// int8->int32 4x4x16 kernel, on a small shape sweep.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_4X4X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    Benchmarker<MatrixMul> benchmarker_int32(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    // Pin each int benchmarker to its specific kernel.
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
    benchmarker_int32.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
    benchmarker_int32.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    // Float baseline uses default dtypes/algorithm selection.
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
        auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used,
               int32_used / int_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 16; k *= 2) {
        for (size_t m = 4; m <= 64; m *= 2) {
            for (size_t n = 4; n <= 64; n *= 2) {
                run(m, n, k);
            }
        }
        std::cout << std::endl;
    }
}
// Benchmark GEMV-like shapes (small M/K, fixed N) for fp32 and, when
// available, fp16; prints Gflops per configuration.
TEST_F(AARCH64, BENCHMARK_GEMV) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker_gemm(handle());
    benchmarker_gemm.set_times(exec_times);
    // Converts (total flops / accumulated ms over exec_times runs) to Gflops.
    float mod = 1000 * exec_times / 1e9;
    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")"
                  << std::endl;
        benchmarker_gemm.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp32, Performance is " << perf << " Gflops"
                  << std::endl;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        benchmarker_gemm.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16());
        time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp16, Performance is " << perf << " Gflops"
                  << std::endl;
#endif
    };
    // Warm up caches/frequency scaling before the timed runs.
    std::cout << "warm up:\n";
    for (int i = 0; i < 50; i++) {
        benchmarker_gemm.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{256, 256}, {256, 256}, {}});
        benchmarker_gemm.set_display(true);
    }
    // run gemv
    for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
        for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
            for (size_t N : {112})
                run(M, K, N);
}
  359. #if __ARM_FEATURE_DOTPROD
// Benchmark int8 GEMM with both operands transposed against the fp32
// default algorithm; prints Gflops and int-vs-float speedup.
TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_INT_8X8X32) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = param.transposeB = true;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            // Empty dtype for the output lets the framework deduce it.
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        // Transposed layout: A is {K, M} and B is {N, K}.
        auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };
    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
// Benchmark int8 GEMM/GEMV shapes (many small M/N/K) against the fp32
// default algorithm; prints Gflops and int-vs-float speedup.
TEST_F(AARCH64, BENCHMARK_GEMV_INT_8X8X32) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            // Empty dtype for the output lets the framework deduce it.
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };
    for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
        for (size_t N : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
            for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
                run(M, N, K);
}
// Benchmark: dot-product int8 8x12x4 kernel in plain layout vs the
// MK4_DOT-packed variant; prints Gflops and the MK4 speedup.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT8X8X32_MK4_8X12X4) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X12X4"));
    // Only the second benchmarker runs in the packed MK4_DOT layout.
    param.format = MatrixMul::Param::Format::MK4_DOT;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        // MK4 layout: A is {M/4, K/4, 4, 4}, B is {K/4, N, 4};
        // M and K are assumed to be multiples of 4 in the sweeps below.
        auto mk_used = benchmarker_mk4.exec(
                               {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                       RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup_vs_normal: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 512; k *= 2) {
        for (size_t m = 4; m <= 512; m *= 2) {
            for (size_t n = 4; n <= 512; n *= 2) {
                run(m, n, k);
            }
        }
        std::cout << std::endl;
    }
}
  462. #endif // __ARM_FEATURE_DOTPROD
  463. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  464. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_F16_MK8) {
  465. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  466. matrix_mul::benchmark_with_contrast(
  467. handle(), args, dtype::Float16{}, dtype::Float16{},
  468. dtype::Float16{}, "AARCH64_F16_MK8_8X8",
  469. param::MatrixMul::Format::MK8, dtype::Float16{}, dtype::Float16{},
  470. dtype::Float16{}, "AARCH64_F16_K8X24X1");
  471. }
  472. #endif
// Benchmark int16->int32 GEMM vs the fp32 default algorithm across all four
// transpose combinations; prints Gflops and speedup per shape.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32) {
    constexpr size_t RUNS = 50;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int16{})
            .set_dtype(1, dtype::Int16{})
            .set_dtype(2, dtype::Int32{})
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K, int mask) {
        // mask bit 0 selects transposeA, bit 1 selects transposeB.
        param::MatrixMul param;
        param.transposeA = mask & 0x1;
        param.transposeB = mask & 0x2;
        benchmarker_int.set_param(param);
        benchmarker_float.set_param(param);
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N} %d{TA} %d{TB}} "
               "float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, param.transposeA, param.transposeB, float_used,
               computations / float_used, int_used, computations / int_used,
               float_used / int_used);
    };
    // Iterate over the 4 transpose combinations encoded in `mask`.
    constexpr int mask = 4;
    for (auto i = 0; i < mask; i++) {
        for (size_t M : {8, 64, 112, 256}) {
            for (size_t K : {8, 64, 112, 256}) {
                for (size_t N : {8, 64, 112, 256}) {
                    run(M, N, K, i);
                }
            }
        }
    }
}
  521. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_MK4) {
  522. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
  523. matrix_mul::benchmark_with_contrast(
  524. handle(), args, dtype::Float32{}, dtype::Float32{},
  525. dtype::Float32{}, "AARCH64_F32_MK4_4x16",
  526. param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
  527. dtype::Float32{});
  528. }
  529. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_PACK_MK4) {
  530. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
  531. matrix_mul::benchmark_with_contrast(
  532. handle(), args, dtype::Float32{}, dtype::Float32{},
  533. dtype::Float32{}, "AARCH64_F32_MK4_K8X12X1",
  534. param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
  535. dtype::Float32{}, "AARCH64_F32K8X12X1");
  536. }
  537. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
  538. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  539. matrix_mul::benchmark_with_contrast(
  540. handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  541. "AARCH64_INT16X16X32_MK8_8X8", param::MatrixMul::Format::MK8,
  542. dtype::Int16{}, dtype::Int16{}, dtype::Int32{});
  543. }
  544. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12) {
  545. constexpr size_t RUNS = 50;
  546. param::MatrixMul param;
  547. param.transposeA = param.transposeB = true;
  548. Benchmarker<MatrixMul> benchmarker_k12x8(handle());
  549. Benchmarker<MatrixMul> benchmarker_k8x12(handle());
  550. benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
  551. benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
  552. benchmarker_k12x8.set_before_exec_callback(
  553. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  554. benchmarker_k8x12.set_before_exec_callback(
  555. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  556. auto run = [&](size_t M, size_t N, size_t K) {
  557. auto k12x8_used = benchmarker_k12x8.exec({{K, M}, {N, K}, {}}) / RUNS;
  558. auto k8x12_used = benchmarker_k8x12.exec({{K, M}, {N, K}, {}}) / RUNS;
  559. float computations = 2.f * M * K * N * 1e-6;
  560. printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
  561. "k8x12: %f ms "
  562. "%f Gflops speedup: %f\n",
  563. M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
  564. computations / k8x12_used, k12x8_used / k8x12_used);
  565. };
  566. run(256, 12 * 24, 256);
  567. for (size_t M : {8, 64, 112, 256}) {
  568. for (size_t K : {8, 64, 112, 256}) {
  569. for (size_t N : {8, 64, 112, 256}) {
  570. run(M, N, K);
  571. }
  572. }
  573. }
  574. }
  575. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12_NO_TRANS) {
  576. constexpr size_t RUNS = 50;
  577. param::MatrixMul param;
  578. param.transposeA = param.transposeB = false;
  579. Benchmarker<MatrixMul> benchmarker_k12x8(handle());
  580. Benchmarker<MatrixMul> benchmarker_k8x12(handle());
  581. benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
  582. benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
  583. benchmarker_k12x8.set_before_exec_callback(
  584. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  585. benchmarker_k8x12.set_before_exec_callback(
  586. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  587. auto run = [&](size_t M, size_t N, size_t K) {
  588. auto k12x8_used = benchmarker_k12x8.exec({{M, K}, {K, N}, {}}) / RUNS;
  589. auto k8x12_used = benchmarker_k8x12.exec({{M, K}, {K, N}, {}}) / RUNS;
  590. float computations = 2.f * M * K * N * 1e-6;
  591. printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
  592. "k8x12: %f ms "
  593. "%f Gflops speedup: %f\n",
  594. M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
  595. computations / k8x12_used, k12x8_used / k8x12_used);
  596. };
  597. run(256, 12 * 24, 256);
  598. for (size_t M : {8, 64, 112, 256}) {
  599. for (size_t K : {8, 64, 112, 256}) {
  600. for (size_t N : {8, 64, 112, 256}) {
  601. run(M, N, K);
  602. }
  603. }
  604. }
  605. }
  606. #endif // MEGDNN_WITH_BENCHMARK
  607. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台