You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 26 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687
  1. /**
  2. * \file dnn/test/aarch64/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/aarch64/fixture.h"
  12. #include "test/common/benchmarker.h"
  13. #include "test/common/checker.h"
  14. #include "test/common/matrix_mul.h"
  15. #include "test/common/rng.h"
  16. using namespace megdnn;
  17. using namespace test;
  18. TEST_F(AARCH64, MATRIX_MUL_FP32K8X12) {
  19. matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
  20. dtype::Float32{}, handle(),
  21. "AARCH64_F32K8X12X1");
  22. }
  23. TEST_F(AARCH64, MATRIX_MUL_FP32K4X16) {
  24. matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
  25. dtype::Float32{}, handle(),
  26. "AARCH64_F32K4X16X1");
  27. }
  28. TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4) {
  29. matrix_mul::check_matrix_mul(
  30. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  31. "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
  32. }
  33. TEST_F(AARCH64, MATRIX_MUL_FP32_MK4) {
  34. //! nbase should be 4 in order to test the last rest 4 in N dim
  35. matrix_mul::check_matrix_mul(
  36. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  37. "AARCH64_F32_MK4_4x16", param::MatrixMul::Format::MK4, 4);
  38. }
  39. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  40. TEST_F(AARCH64, MATRIX_MUL_F16_K8X24X1) {
  41. matrix_mul::check_matrix_mul(dtype::Float16{}, dtype::Float16{},
  42. dtype::Float16{}, handle(),
  43. "AARCH64_F16_K8X24X1");
  44. }
  45. TEST_F(AARCH64, MATRIX_MUL_F16_MK8) {
  46. //! nbase should be 4 in order to test the last rest 4 in N dim
  47. matrix_mul::check_matrix_mul(
  48. dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
  49. "AARCH64_F16_MK8_8X8", param::MatrixMul::Format::MK8, 4);
  50. }
  51. #endif
  52. #if __ARM_FEATURE_DOTPROD
  53. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K8X12X4_DOTPROD) {
  54. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  55. handle(), "AARCH64_INT8X8X32_K8X12X4_DOTPROD");
  56. }
  57. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_MK4_8X12X4_DOTPROD) {
  58. std::vector<matrix_mul::TestArg> args;
  59. for (size_t m : {1, 2, 3, 4, 5, 6, 7, 10, 11})
  60. for (size_t n : {2, 3, 4, 5, 8, 12, 13, 14, 15, 16, 31})
  61. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  62. args.emplace_back(m, n, k, 0);
  63. matrix_mul::check_matrix_mul(
  64. dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
  65. "AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD",
  66. param::MatrixMul::Format::MK4_DOT, 1, 1e-3, std::move(args));
  67. }
  68. #else
  69. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K4X4X16) {
  70. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  71. handle(), "AARCH64_INT8X8X32_K4X4X16");
  72. }
  73. TEST_F(AARCH64, MATRIX_MUL_INT8_MK4) {
  74. std::vector<matrix_mul::TestArg> args;
  75. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  76. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  77. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  78. args.emplace_back(m, n, k, 0);
  79. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  80. handle(), "AARCH64_INT8X8X32_MK4_4X4X16",
  81. param::MatrixMul::Format::MK4, 1, 1e-3,
  82. std::move(args));
  83. }
  84. TEST_F(AARCH64, MATRIX_MUL_INT8x8x32_K8x8x8) {
  85. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  86. handle(), "AARCH64_INT8X8X32_K8X8X8");
  87. }
  88. #endif
  89. TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K8x8x8) {
  90. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  91. handle(), "AARCH64_INT8X8X16_K8X8X8");
  92. }
  93. TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K4x4x16) {
  94. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
  95. handle(), "AARCH64_INT8X8X16_K4X4X16");
  96. }
  97. TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_K12X8X1) {
  98. matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  99. handle(), "AARCH64_INT16X16X32_K12X8X1");
  100. }
  101. TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_MK8) {
  102. //! nbase should be 4 in order to test the last rest 4 in N dim
  103. matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  104. handle(), "AARCH64_INT16X16X32_MK8_8X8",
  105. param::MatrixMul::Format::MK8, 4);
  106. }
  107. //! FIXME: need to add tests of GEMV and QUINT8
  108. #if MEGDNN_WITH_BENCHMARK
  109. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K4X16) {
  110. constexpr size_t RUNS = 50;
  111. param::MatrixMul param;
  112. param.transposeA = false;
  113. param.transposeB = false;
  114. Benchmarker<MatrixMul> benchmarker_K4X16(handle());
  115. Benchmarker<MatrixMul> benchmarker_K12X8(handle());
  116. benchmarker_K4X16.set_times(RUNS)
  117. .set_dtype(0, dtype::Float32{})
  118. .set_dtype(1, dtype::Float32{})
  119. .set_dtype(2, dtype::Float32{})
  120. .set_param(param)
  121. .set_display(false);
  122. benchmarker_K4X16.set_before_exec_callback(
  123. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  124. benchmarker_K12X8.set_before_exec_callback(
  125. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  126. benchmarker_K12X8.set_times(RUNS)
  127. .set_dtype(0, dtype::Float32{})
  128. .set_dtype(1, dtype::Float32{})
  129. .set_dtype(2, dtype::Float32{})
  130. .set_param(param)
  131. .set_display(false);
  132. auto run = [&](size_t M, size_t N, size_t K) {
  133. TensorShape A, B;
  134. if (param.transposeA) {
  135. A = TensorShape{K, M};
  136. } else {
  137. A = TensorShape{M, K};
  138. }
  139. if (param.transposeB) {
  140. B = TensorShape{N, K};
  141. } else {
  142. B = TensorShape{K, N};
  143. }
  144. auto k4x16_used = benchmarker_K4X16.exec({A, B, {}}) / RUNS;
  145. auto k12x8_used = benchmarker_K12X8.exec({A, B, {}}) / RUNS;
  146. float computations = 2.f * M * K * N * 1e-6;
  147. printf("run: {%zu{M} %zu{K} %zu{N}} k4x16: %f ms %f Gflops k12x8: %f "
  148. "ms "
  149. "%f Gflops k4x16_vs_k12x8: %f\n",
  150. M, K, N, k4x16_used, computations / k4x16_used, k12x8_used,
  151. computations / k12x8_used, k12x8_used / k4x16_used);
  152. };
  153. run(256, 256, 128);
  154. for (size_t k = 4; k <= 256; k *= 8) {
  155. for (size_t m = 4; m <= 256; m *= 4) {
  156. for (size_t n = 4; n <= 256; n *= 4) {
  157. run(m, n, k);
  158. }
  159. printf("\n");
  160. }
  161. printf("\n");
  162. }
  163. }
  164. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_8X8X8) {
  165. constexpr size_t RUNS = 50;
  166. param::MatrixMul param;
  167. param.transposeA = false;
  168. param.transposeB = false;
  169. Benchmarker<MatrixMul> benchmarker_int(handle());
  170. Benchmarker<MatrixMul> benchmarker_int32(handle());
  171. benchmarker_int.set_times(RUNS)
  172. .set_dtype(0, dtype::Int8{})
  173. .set_dtype(1, dtype::Int8{})
  174. .set_dtype(2, dtype::Int16{})
  175. .set_param(param)
  176. .set_display(false);
  177. benchmarker_int.set_before_exec_callback(
  178. AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K8X8X8"));
  179. benchmarker_int32.set_before_exec_callback(
  180. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X8X8"));
  181. benchmarker_int32.set_times(RUNS)
  182. .set_dtype(0, dtype::Int8{})
  183. .set_dtype(1, dtype::Int8{})
  184. .set_dtype(2, dtype::Int32{})
  185. .set_param(param)
  186. .set_display(false);
  187. Benchmarker<MatrixMul> benchmarker_float(handle());
  188. benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
  189. auto run = [&](size_t M, size_t N, size_t K) {
  190. TensorShape A, B;
  191. if (param.transposeA) {
  192. A = TensorShape{K, M};
  193. } else {
  194. A = TensorShape{M, K};
  195. }
  196. if (param.transposeB) {
  197. B = TensorShape{N, K};
  198. } else {
  199. B = TensorShape{K, N};
  200. }
  201. auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
  202. auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
  203. auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
  204. float computations = 2.f * M * K * N * 1e-6;
  205. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  206. "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
  207. M, K, N, float_used, computations / float_used, int_used,
  208. computations / int_used, float_used / int_used,
  209. int32_used / int_used);
  210. };
  211. run(256, 256, 128);
  212. for (size_t k = 4; k <= 256; k *= 8) {
  213. for (size_t m = 4; m <= 256; m *= 4) {
  214. for (size_t n = 4; n <= 256; n *= 4) {
  215. run(m, n, k);
  216. }
  217. std::cout << std::endl;
  218. }
  219. std::cout << std::endl;
  220. }
  221. }
  222. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT32_MK_4X4X16) {
  223. constexpr size_t RUNS = 50;
  224. param::MatrixMul param;
  225. param.transposeA = false;
  226. param.transposeB = false;
  227. Benchmarker<MatrixMul> benchmarker(handle());
  228. Benchmarker<MatrixMul> benchmarker_mk4(handle());
  229. benchmarker.set_times(RUNS)
  230. .set_dtype(0, dtype::Int8{})
  231. .set_dtype(1, dtype::Int8{})
  232. .set_dtype(2, dtype::Int32{})
  233. .set_param(param)
  234. .set_display(false);
  235. benchmarker.set_before_exec_callback(
  236. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
  237. param.format = MatrixMul::Param::Format::MK4;
  238. benchmarker_mk4.set_before_exec_callback(
  239. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_4X4X16"));
  240. benchmarker_mk4.set_times(RUNS)
  241. .set_dtype(0, dtype::Int8{})
  242. .set_dtype(1, dtype::Int8{})
  243. .set_dtype(2, dtype::Int32{})
  244. .set_param(param)
  245. .set_display(false);
  246. auto run = [&](size_t M, size_t N, size_t K) {
  247. auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
  248. auto mk_used = benchmarker_mk4.exec(
  249. {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  250. RUNS;
  251. float computations = 2.f * M * K * N * 1e-6;
  252. printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
  253. "%f Gflops speedup_vs_normal: %f\n",
  254. M, K, N, default_used, computations / default_used, mk_used,
  255. computations / mk_used, default_used / mk_used);
  256. };
  257. run(256, 256, 128);
  258. for (size_t k = 4; k <= 512; k *= 2) {
  259. for (size_t m = 4; m <= 512; m *= 2) {
  260. for (size_t n = 4; n <= 512; n *= 2) {
  261. run(m, n, k);
  262. }
  263. }
  264. std::cout << std::endl;
  265. }
  266. }
  267. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_4X4X16) {
  268. constexpr size_t RUNS = 50;
  269. param::MatrixMul param;
  270. param.transposeA = false;
  271. param.transposeB = false;
  272. Benchmarker<MatrixMul> benchmarker_int(handle());
  273. Benchmarker<MatrixMul> benchmarker_int32(handle());
  274. benchmarker_int.set_times(RUNS)
  275. .set_dtype(0, dtype::Int8{})
  276. .set_dtype(1, dtype::Int8{})
  277. .set_dtype(2, dtype::Int16{})
  278. .set_param(param)
  279. .set_display(false);
  280. benchmarker_int.set_before_exec_callback(
  281. AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
  282. benchmarker_int32.set_before_exec_callback(
  283. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
  284. benchmarker_int32.set_times(RUNS)
  285. .set_dtype(0, dtype::Int8{})
  286. .set_dtype(1, dtype::Int8{})
  287. .set_dtype(2, dtype::Int32{})
  288. .set_param(param)
  289. .set_display(false);
  290. Benchmarker<MatrixMul> benchmarker_float(handle());
  291. benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
  292. auto run = [&](size_t M, size_t N, size_t K) {
  293. TensorShape A, B;
  294. if (param.transposeA) {
  295. A = TensorShape{K, M};
  296. } else {
  297. A = TensorShape{M, K};
  298. }
  299. if (param.transposeB) {
  300. B = TensorShape{N, K};
  301. } else {
  302. B = TensorShape{K, N};
  303. }
  304. auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
  305. auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
  306. auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
  307. float computations = 2.f * M * K * N * 1e-6;
  308. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  309. "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
  310. M, K, N, float_used, computations / float_used, int_used,
  311. computations / int_used, float_used / int_used,
  312. int32_used / int_used);
  313. };
  314. run(256, 256, 128);
  315. for (size_t k = 4; k <= 16; k *= 2) {
  316. for (size_t m = 4; m <= 64; m *= 2) {
  317. for (size_t n = 4; n <= 64; n *= 2) {
  318. run(m, n, k);
  319. }
  320. }
  321. std::cout << std::endl;
  322. }
  323. }
  324. TEST_F(AARCH64, BENCHMARK_GEMV) {
  325. int exec_times = 10;
  326. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  327. benchmarker_gemm.set_times(exec_times);
  328. float mod = 1000 * exec_times / 1e9;
  329. auto run = [&](size_t M, size_t K, size_t N) {
  330. float time = 1.f, perf = 1.f;
  331. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")"
  332. << std::endl;
  333. benchmarker_gemm.set_dtype(0, dtype::Float32())
  334. .set_dtype(1, dtype::Float32());
  335. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  336. perf = 2.f * M * K * N / time * mod;
  337. std::cout << "gemm fp32, Performance is " << perf << " Gflops"
  338. << std::endl;
  339. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  340. benchmarker_gemm.set_dtype(0, dtype::Float16())
  341. .set_dtype(1, dtype::Float16());
  342. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  343. perf = 2.f * M * K * N / time * mod;
  344. std::cout << "gemm fp16, Performance is " << perf << " Gflops"
  345. << std::endl;
  346. #endif
  347. };
  348. std::cout << "warm up:\n";
  349. for (int i = 0; i < 50; i++) {
  350. benchmarker_gemm.set_dtype(0, dtype::Float32())
  351. .set_dtype(1, dtype::Float32())
  352. .set_display(false)
  353. .exec({{256, 256}, {256, 256}, {}});
  354. benchmarker_gemm.set_display(true);
  355. }
  356. // run gemv
  357. for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
  358. for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
  359. for (size_t N : {112})
  360. run(M, K, N);
  361. }
  362. #if __ARM_FEATURE_DOTPROD
  363. TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_INT_8X8X32) {
  364. constexpr size_t RUNS = 50;
  365. param::MatrixMul param;
  366. param.transposeA = param.transposeB = true;
  367. Benchmarker<MatrixMul> benchmarker_int(handle());
  368. benchmarker_int.set_times(RUNS)
  369. .set_dtype(0, dtype::Int8{})
  370. .set_dtype(1, dtype::Int8{})
  371. .set_dtype(2, {})
  372. .set_param(param)
  373. .set_display(false);
  374. Benchmarker<MatrixMul> benchmarker_float(handle());
  375. benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
  376. auto run = [&](size_t M, size_t N, size_t K) {
  377. auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
  378. auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
  379. float computations = 2.f * M * K * N * 1e-6;
  380. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  381. "%f Gflops speedup: %f\n",
  382. M, K, N, float_used, computations / float_used, int_used,
  383. computations / int_used, float_used / int_used);
  384. };
  385. run(256, 12 * 24, 256);
  386. for (size_t M : {8, 64, 112, 256}) {
  387. for (size_t K : {8, 64, 112, 256}) {
  388. for (size_t N : {8, 64, 112, 256}) {
  389. run(M, N, K);
  390. }
  391. }
  392. }
  393. }
  394. TEST_F(AARCH64, BENCHMARK_GEMV_INT_8X8X32) {
  395. constexpr size_t RUNS = 50;
  396. param::MatrixMul param;
  397. Benchmarker<MatrixMul> benchmarker_int(handle());
  398. benchmarker_int.set_times(RUNS)
  399. .set_dtype(0, dtype::Int8{})
  400. .set_dtype(1, dtype::Int8{})
  401. .set_dtype(2, {})
  402. .set_param(param)
  403. .set_display(false);
  404. Benchmarker<MatrixMul> benchmarker_float(handle());
  405. benchmarker_float.set_display(false).set_times(RUNS);
  406. auto run = [&](size_t M, size_t N, size_t K) {
  407. auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
  408. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  409. float computations = 2.f * M * K * N * 1e-6;
  410. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  411. "%f Gflops speedup: %f\n",
  412. M, K, N, float_used, computations / float_used, int_used,
  413. computations / int_used, float_used / int_used);
  414. };
  415. for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
  416. for (size_t N : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
  417. for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
  418. run(M, N, K);
  419. }
  420. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT8X8X32_MK4_8X12X4) {
  421. constexpr size_t RUNS = 50;
  422. param::MatrixMul param;
  423. param.transposeA = false;
  424. param.transposeB = false;
  425. Benchmarker<MatrixMul> benchmarker(handle());
  426. Benchmarker<MatrixMul> benchmarker_mk4(handle());
  427. benchmarker.set_times(RUNS)
  428. .set_dtype(0, dtype::Int8{})
  429. .set_dtype(1, dtype::Int8{})
  430. .set_dtype(2, dtype::Int32{})
  431. .set_param(param)
  432. .set_display(false);
  433. benchmarker.set_before_exec_callback(
  434. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X12X4"));
  435. param.format = MatrixMul::Param::Format::MK4_DOT;
  436. benchmarker_mk4.set_before_exec_callback(
  437. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD"));
  438. benchmarker_mk4.set_times(RUNS)
  439. .set_dtype(0, dtype::Int8{})
  440. .set_dtype(1, dtype::Int8{})
  441. .set_dtype(2, dtype::Int32{})
  442. .set_param(param)
  443. .set_display(false);
  444. auto run = [&](size_t M, size_t N, size_t K) {
  445. auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
  446. auto mk_used = benchmarker_mk4.exec(
  447. {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  448. RUNS;
  449. float computations = 2.f * M * K * N * 1e-6;
  450. printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
  451. "%f Gflops speedup_vs_normal: %f\n",
  452. M, K, N, default_used, computations / default_used, mk_used,
  453. computations / mk_used, default_used / mk_used);
  454. };
  455. run(256, 256, 128);
  456. for (size_t k = 4; k <= 512; k *= 2) {
  457. for (size_t m = 4; m <= 512; m *= 2) {
  458. for (size_t n = 4; n <= 512; n *= 2) {
  459. run(m, n, k);
  460. }
  461. }
  462. std::cout << std::endl;
  463. }
  464. }
  465. #endif // __ARM_FEATURE_DOTPROD
  466. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  467. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_F16_MK8) {
  468. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  469. matrix_mul::benchmark_with_contrast(
  470. handle(), args, dtype::Float16{}, dtype::Float16{},
  471. dtype::Float16{}, "AARCH64_F16_MK8_8X8",
  472. param::MatrixMul::Format::MK8, dtype::Float16{}, dtype::Float16{},
  473. dtype::Float16{}, "AARCH64_F16_K8X24X1");
  474. }
  475. #endif
  476. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32) {
  477. constexpr size_t RUNS = 50;
  478. Benchmarker<MatrixMul> benchmarker_int(handle());
  479. benchmarker_int.set_times(RUNS)
  480. .set_dtype(0, dtype::Int16{})
  481. .set_dtype(1, dtype::Int16{})
  482. .set_dtype(2, dtype::Int32{})
  483. .set_display(false);
  484. Benchmarker<MatrixMul> benchmarker_float(handle());
  485. benchmarker_float.set_display(false).set_times(RUNS);
  486. auto run = [&](size_t M, size_t N, size_t K, int mask) {
  487. param::MatrixMul param;
  488. param.transposeA = mask & 0x1;
  489. param.transposeB = mask & 0x2;
  490. benchmarker_int.set_param(param);
  491. benchmarker_float.set_param(param);
  492. TensorShape A, B;
  493. if (param.transposeA) {
  494. A = TensorShape{K, M};
  495. } else {
  496. A = TensorShape{M, K};
  497. }
  498. if (param.transposeB) {
  499. B = TensorShape{N, K};
  500. } else {
  501. B = TensorShape{K, N};
  502. }
  503. auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
  504. auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
  505. float computations = 2.f * M * K * N * 1e-6;
  506. printf("run: {%zu{M} %zu{K} %zu{N} %d{TA} %d{TB}} "
  507. "float: %f ms %f Gflops int: %f ms "
  508. "%f Gflops speedup: %f\n",
  509. M, K, N, param.transposeA, param.transposeB, float_used,
  510. computations / float_used, int_used, computations / int_used,
  511. float_used / int_used);
  512. };
  513. constexpr int mask = 4;
  514. for (auto i = 0; i < mask; i++) {
  515. for (size_t M : {8, 64, 112, 256}) {
  516. for (size_t K : {8, 64, 112, 256}) {
  517. for (size_t N : {8, 64, 112, 256}) {
  518. run(M, N, K, i);
  519. }
  520. }
  521. }
  522. }
  523. }
  524. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_MK4) {
  525. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
  526. matrix_mul::benchmark_with_contrast(
  527. handle(), args, dtype::Float32{}, dtype::Float32{},
  528. dtype::Float32{}, "AARCH64_F32_MK4_4x16",
  529. param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
  530. dtype::Float32{});
  531. }
  532. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_PACK_MK4) {
  533. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
  534. matrix_mul::benchmark_with_contrast(
  535. handle(), args, dtype::Float32{}, dtype::Float32{},
  536. dtype::Float32{}, "AARCH64_F32_MK4_K8X12X1",
  537. param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
  538. dtype::Float32{}, "AARCH64_F32K8X12X1");
  539. }
  540. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
  541. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  542. matrix_mul::benchmark_with_contrast(
  543. handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  544. "AARCH64_INT16X16X32_MK8_8X8", param::MatrixMul::Format::MK8,
  545. dtype::Int16{}, dtype::Int16{}, dtype::Int32{});
  546. }
  547. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12) {
  548. constexpr size_t RUNS = 50;
  549. param::MatrixMul param;
  550. param.transposeA = param.transposeB = true;
  551. Benchmarker<MatrixMul> benchmarker_k12x8(handle());
  552. Benchmarker<MatrixMul> benchmarker_k8x12(handle());
  553. benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
  554. benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
  555. benchmarker_k12x8.set_before_exec_callback(
  556. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  557. benchmarker_k8x12.set_before_exec_callback(
  558. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  559. auto run = [&](size_t M, size_t N, size_t K) {
  560. auto k12x8_used = benchmarker_k12x8.exec({{K, M}, {N, K}, {}}) / RUNS;
  561. auto k8x12_used = benchmarker_k8x12.exec({{K, M}, {N, K}, {}}) / RUNS;
  562. float computations = 2.f * M * K * N * 1e-6;
  563. printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
  564. "k8x12: %f ms "
  565. "%f Gflops speedup: %f\n",
  566. M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
  567. computations / k8x12_used, k12x8_used / k8x12_used);
  568. };
  569. run(256, 12 * 24, 256);
  570. for (size_t M : {8, 64, 112, 256}) {
  571. for (size_t K : {8, 64, 112, 256}) {
  572. for (size_t N : {8, 64, 112, 256}) {
  573. run(M, N, K);
  574. }
  575. }
  576. }
  577. }
  578. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12_NO_TRANS) {
  579. constexpr size_t RUNS = 50;
  580. param::MatrixMul param;
  581. param.transposeA = param.transposeB = false;
  582. Benchmarker<MatrixMul> benchmarker_k12x8(handle());
  583. Benchmarker<MatrixMul> benchmarker_k8x12(handle());
  584. benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
  585. benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
  586. benchmarker_k12x8.set_before_exec_callback(
  587. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  588. benchmarker_k8x12.set_before_exec_callback(
  589. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  590. auto run = [&](size_t M, size_t N, size_t K) {
  591. auto k12x8_used = benchmarker_k12x8.exec({{M, K}, {K, N}, {}}) / RUNS;
  592. auto k8x12_used = benchmarker_k8x12.exec({{M, K}, {K, N}, {}}) / RUNS;
  593. float computations = 2.f * M * K * N * 1e-6;
  594. printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
  595. "k8x12: %f ms "
  596. "%f Gflops speedup: %f\n",
  597. M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
  598. computations / k8x12_used, k12x8_used / k8x12_used);
  599. };
  600. run(256, 12 * 24, 256);
  601. for (size_t M : {8, 64, 112, 256}) {
  602. for (size_t K : {8, 64, 112, 256}) {
  603. for (size_t N : {8, 64, 112, 256}) {
  604. run(M, N, K);
  605. }
  606. }
  607. }
  608. }
  609. #endif // MEGDNN_WITH_BENCHMARK
  610. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台