
matrix_mul.cpp

#include "test/aarch64/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/rng.h"
#include "test/arm_common/cpuinfo_help.h"

using namespace megdnn;
using namespace test;
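
// Correctness checks: each test below runs matrix_mul::check_matrix_mul() against
// one named AArch64 kernel; benchmarks follow under MEGDNN_WITH_BENCHMARK.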
TEST_F(AARCH64, MATRIX_MUL_FP32K8X12) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32K8X12X1");
}
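
// With cpuinfo available, re-run the FP32 K8x12 check while CpuInfoTmpReplace
// temporarily reports a Cortex-A53/A55 uarch, so that any A53/A55-specific code
// paths of the kernel are exercised as well.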
#if MGB_ENABLE_CPUINFO
TEST_F(AARCH64, MATRIX_MUL_FP32K8X12_A53) {
    CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a53);
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32K8X12X1");
}

TEST_F(AARCH64, MATRIX_MUL_FP32K8X12_A55) {
    CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a55);
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32K8X12X1");
}
#endif

TEST_F(AARCH64, MATRIX_MUL_FP32K4X16) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32K4X16X1");
}

TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
}

#if MGB_ENABLE_CPUINFO
TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4_A53) {
    CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a53);
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
}

TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4_A55) {
    CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a55);
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
}
#endif

TEST_F(AARCH64, MATRIX_MUL_FP32_MK4) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32_MK4_4x16", param::MatrixMul::Format::MK4, 1);
}
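
// FP16 kernels are only built when the toolchain provides half-precision vector
// arithmetic (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC).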
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(AARCH64, MATRIX_MUL_F16_K8X24X1) {
    matrix_mul::check_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
            "AARCH64_F16_K8X24X1");
}

TEST_F(AARCH64, MATRIX_MUL_F16_MK8) {
    matrix_mul::check_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
            "AARCH64_F16_MK8_8X8", param::MatrixMul::Format::MK8, 1);
}

TEST_F(AARCH64, MATRIX_MUL_F16_MK8_16x12x1) {
    matrix_mul::check_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
            "AARCH64_F16_MK8_16X12X1", param::MatrixMul::Format::MK8, 1);
}
#endif
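
// Int8 kernels: when MGB_ENABLE_DOT is set the SDOT-based kernels are tested;
// otherwise the plain NEON int8 kernels in the #else branch are used.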
#if MGB_ENABLE_DOT
TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K8X12X4_DOTPROD) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
            "AARCH64_INT8X8X32_K8X12X4_DOTPROD");
}

TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_MK4_8X12X4_DOTPROD) {
    std::vector<matrix_mul::TestArg> args;
    for (size_t m : {1, 2, 3, 4, 5, 6, 7, 10, 11})
        for (size_t n : {2, 3, 4, 5, 8, 12, 13, 14, 15, 16, 31})
            for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
                args.emplace_back(m, n, k, 0);
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
            "AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD", param::MatrixMul::Format::MK4_DOT,
            1, 1e-3, std::move(args));
}
#else
TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K4X4X16) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
            "AARCH64_INT8X8X32_K4X4X16");
}

TEST_F(AARCH64, MATRIX_MUL_INT8_MK4) {
    std::vector<matrix_mul::TestArg> args;
    for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
        for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
            for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
                args.emplace_back(m, n, k, 0);
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
            "AARCH64_INT8X8X32_MK4_4X4X16", param::MatrixMul::Format::MK4, 1, 1e-3,
            std::move(args));
}

TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_MK4) {
    std::vector<matrix_mul::TestArg> args;
    for (size_t m : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17})
        for (size_t n : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24})
            for (size_t k : {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                             16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29})
                args.emplace_back(m, n, k, 0);
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_MK4_K8X8X8", param::MatrixMul::Format::MK4, 1, 1e-3,
            std::move(args));
}

TEST_F(AARCH64, MATRIX_MUL_MK4_8x8x16_4x4) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_MK4_4X4X8", param::MatrixMul::Format::MK4, 1);
}

TEST_F(AARCH64, MATRIX_MUL_MK4_8x8x16) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_MK4_16X12X4", param::MatrixMul::Format::MK4, 1);
}

TEST_F(AARCH64, MATRIX_MUL_INT8x8x32_K8x8x8) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
            "AARCH64_INT8X8X32_K8X8X8");
}
#endif

TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K8x8x8) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_K8X8X8");
}

TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K4x4x16) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_K4X4X16");
}
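
// The int4 GEMM does not go through check_matrix_mul(); the Checker is set up by
// hand so that transposeA/transposeB and the quantized-int4 dtypes can be driven
// directly (the output scale is the product of the two input scales, 0.6 * 0.5).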
TEST_F(AARCH64, MATRIX_MUL_INT4x4x16_K8x8x8_QUANTIZEDS4) {
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Checker<MatrixMul> checker(handle());
    checker.set_dtype(0, dtype::QuantizedS4{0.6})
            .set_dtype(1, dtype::QuantizedS4{0.5})
            .set_dtype(2, dtype::QuantizedS16{0.6 * 0.5})
            .set_param(param);
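    // Pin algorithm selection to the kernel under test; the AlgoChecker callback
    // (assumption: it fails the run if a different algorithm would be chosen) keeps
    // the check from silently falling back to another implementation.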
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT4X4X16_K8X8X8"));
    auto run = [&](size_t M, size_t N, size_t K) {
        printf("M N K %zu %zu %zu \n", M, N, K);
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        checker.exec({A, B, {}});
    };
    for (size_t m : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 20})
        for (size_t n : {2, 4, 6, 8, 10, 12, 14, 16, 24})
            for (size_t k : {2, 4, 6, 8, 10, 12, 14, 16, 32})
                run(m, n, k);
    for (size_t k = 4; k <= 256; k *= 8) {
        for (size_t m = 4; m <= 256; m *= 4) {
            for (size_t n = 4; n <= 256; n *= 4) {
                run(m, n, k);
            }
        }
    }
    param.transposeA = true;
    run(8, 8, 8);
    run(16, 8, 16);
    param.transposeB = true;
    run(8, 8, 8);
    run(16, 16, 16);
}

TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_K12X8X1) {
    matrix_mul::check_matrix_mul(
            dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
            "AARCH64_INT16X16X32_K12X8X1");
}

TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_MK8) {
    matrix_mul::check_matrix_mul(
            dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
            "AARCH64_INT16X16X32_MK8_8X8", param::MatrixMul::Format::MK8, 1);
}

//! FIXME: need to add tests of GEMV and QUINT8

#if MEGDNN_WITH_BENCHMARK
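
// Benchmarks: each test times two or three algorithm variants on the same shapes
// over RUNS iterations and prints per-run time, GFLOPS and speedup ratios.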
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K4X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker_K4X16(handle());
    Benchmarker<MatrixMul> benchmarker_K12X8(handle());
    benchmarker_K4X16.set_times(RUNS)
            .set_dtype(0, dtype::Float32{})
            .set_dtype(1, dtype::Float32{})
            .set_dtype(2, dtype::Float32{})
            .set_param(param)
            .set_display(false);
    benchmarker_K4X16.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
    benchmarker_K12X8.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
    benchmarker_K12X8.set_times(RUNS)
            .set_dtype(0, dtype::Float32{})
            .set_dtype(1, dtype::Float32{})
            .set_dtype(2, dtype::Float32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto k4x16_used = benchmarker_K4X16.exec({A, B, {}}) / RUNS;
        auto k12x8_used = benchmarker_K12X8.exec({A, B, {}}) / RUNS;
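        // 2*M*N*K ops scaled by 1e-6: dividing these "mega-ops" by a per-run time
        // in milliseconds yields GFLOPS.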
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} k4x16: %f ms %f Gflops k12x8: %f "
               "ms "
               "%f Gflops k4x16_vs_k12x8: %f\n",
               M, K, N, k4x16_used, computations / k4x16_used, k12x8_used,
               computations / k12x8_used, k12x8_used / k4x16_used);
    };
    run(256, 256, 128);
    run(384, 384, 384);
    for (size_t k = 4; k <= 256; k *= 8) {
        for (size_t m = 4; m <= 256; m *= 4) {
            for (size_t n = 4; n <= 256; n *= 4) {
                run(m, n, k);
            }
            printf("\n");
        }
        printf("\n");
    }
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_8X8X8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    Benchmarker<MatrixMul> benchmarker_int32(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K8X8X8"));
    benchmarker_int32.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X8X8"));
    benchmarker_int32.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
        auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used, int32_used / int_used);
    };
    run(256, 256, 256);
    for (size_t k = 4; k <= 256; k *= 8) {
        for (size_t m = 4; m <= 256; m *= 4) {
            for (size_t n = 4; n <= 256; n *= 4) {
                run(m, n, k);
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT32_MK_4X4X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
    param.format = MatrixMul::Param::Format::MK4;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_4X4X16"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
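        // MK4 layout: A is pre-packed as {M/4, K/4, 4, 4} and B as {K/4, N, 4}, so
        // the MK4 benchmarker is fed the packed shapes directly.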
        auto mk_used =
                benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup_vs_normal: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 512; k *= 2) {
        for (size_t m = 4; m <= 512; m *= 2) {
            for (size_t n = 4; n <= 512; n *= 2) {
                run(m, n, k);
            }
        }
        std::cout << std::endl;
    }
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_MK4_8x8x16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    Benchmarker<MatrixMul> benchmarker_mk4_16x12(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
    param.format = MatrixMul::Param::Format::MK4;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_4X4X8"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker_mk4_16x12.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_16X12X4"));
    benchmarker_mk4_16x12.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto mk_used =
                benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
        auto mk4_16x12_used =
                benchmarker_mk4_16x12.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup: %f, mk4_16x12 %f Gflops speedup: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used,
               computations / mk4_16x12_used, default_used / mk4_16x12_used);
    };
    run(384, 384, 384);
}

TEST_F(AARCH64, BENCHMARK_4x4x16_vs_8x8x16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_int4_4x4x16(handle());
    benchmarker_int4_4x4x16.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS4{0.3})
            .set_dtype(1, dtype::QuantizedS4{0.3})
            .set_dtype(2, dtype::QuantizedS16{0.09})
            .set_param(param)
            .set_display(false);
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto int4416_used = benchmarker_int4_4x4x16.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal 8x8x16 used: %f ms %f "
               "Gflops int4416 used %f int4416_gflops %f speedup %f\n",
               M, K, N, default_used, computations / default_used, int4416_used,
               computations / int4416_used, default_used / int4416_used);
    };
    for (int m = 32; m <= 1024; m += 32)
        for (int n = 32; n <= 1024; n += 32)
            for (int k = 32; k <= 512; k += 32)
                run(m, n, k);
    run(32, 32, 32);
    run(32, 32, 8);
    run(32, 32, 16);
    run(32, 32, 24);
    run(32 * 2, 32 * 2, 32);
    run(32 * 4, 32 * 4, 32);
    run(32 * 6, 32 * 6, 32);
    run(32 * 8, 32 * 8, 32);
    run(32 * 2, 32 * 2, 32 * 2);
    run(32 * 4, 32 * 4, 32 * 3);
    run(32 * 6, 32 * 6, 32 * 4);
    run(32 * 8, 32 * 8, 32 * 5);
    run(32 * 10, 32 * 10, 32 * 10);
    run(384, 384, 384);
    run(256, 256, 384);
    run(512, 512, 384);
    run(1024, 1024, 384);
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_MK4_8x8x8_8x8x16_vs_4x4x16_8x8x16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    Benchmarker<MatrixMul> benchmarker_mk4_4x4x8(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
    param.format = MatrixMul::Param::Format::MK4;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_K8X8X8"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker_mk4_4x4x8.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_4X4X8"));
    benchmarker_mk4_4x4x8.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto mk_used =
                benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
        auto mk4_4x4x8_used =
                benchmarker_mk4_4x4x8.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup: %f, mk4_4x4x8 %f Gflops %f ms speedup: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used,
               computations / mk4_4x4x8_used, mk4_4x4x8_used, mk4_4x4x8_used / mk_used);
    };
    run(384, 384, 384);
    run(512, 512, 512);
    run(1024, 1024, 384);
    run(256, 256, 384);
    for (int m = 32; m <= 512; m *= 2)
        for (int n = 32; n <= 512; n *= 2)
            for (int k = 32; k < 512; k *= 2) {
                run(m, n, k);
            }
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_4X4X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    Benchmarker<MatrixMul> benchmarker_int32(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
    benchmarker_int32.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
    benchmarker_int32.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
        auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used, int32_used / int_used);
    };
    run(256, 256, 128);
    run(256, 256, 256);
    for (size_t k = 4; k <= 256; k *= 4) {
        for (size_t m = 4; m <= 256; m *= 4) {
            for (size_t n = 4; n <= 256; n *= 4) {
                run(m, n, k);
            }
        }
        std::cout << std::endl;
    }
}

TEST_F(AARCH64, BENCHMARK_GEMV) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker_gemm(handle());
    benchmarker_gemm.set_times(exec_times);
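    // Conversion factor for the GFLOPS prints below: assuming exec() reports the
    // summed time in ms over exec_times runs, multiplying 2*M*K*N / time by
    // mod = 1000 * exec_times / 1e9 yields Gflop/s.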
    float mod = 1000 * exec_times / 1e9;
    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")" << std::endl;
        benchmarker_gemm.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
        time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp32, Performance is " << perf << " Gflops" << std::endl;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        benchmarker_gemm.set_dtype(0, dtype::Float16()).set_dtype(1, dtype::Float16());
        time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp16, Performance is " << perf << " Gflops" << std::endl;
#endif
    };
    std::cout << "warm up:\n";
    for (int i = 0; i < 50; i++) {
        benchmarker_gemm.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{256, 256}, {256, 256}, {}});
        benchmarker_gemm.set_display(true);
    }
    // run gemv
    for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
        for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
            for (size_t N : {112})
                run(M, K, N);
}

#if MGB_ENABLE_DOT
TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_INT_8X8X32) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = param.transposeB = true;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };
    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(AARCH64, BENCHMARK_GEMV_INT_8X8X32) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };
    for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
        for (size_t N : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
            for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
                run(M, N, K);
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT8X8X32_MK4_8X12X4) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X12X4"));
    param.format = MatrixMul::Param::Format::MK4_DOT;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto mk_used =
                benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup_vs_normal: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 512; k *= 2) {
        for (size_t m = 4; m <= 512; m *= 2) {
            for (size_t n = 4; n <= 512; n *= 2) {
                run(m, n, k);
            }
        }
        std::cout << std::endl;
    }
}
#endif // MGB_ENABLE_DOT
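
// matrix_mul::benchmark_with_contrast (from test/common/matrix_mul.h) is assumed to
// time the first-named algorithm in its packed format against the contrast algorithm
// on the same shapes and print their relative performance.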
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_F16_MK8) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
            "AARCH64_F16_MK8_8X8", param::MatrixMul::Format::MK8, dtype::Float16{},
            dtype::Float16{}, dtype::Float16{}, "AARCH64_F16_K8X24X1");
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_F16_MK8_16x12) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
            "AARCH64_F16_MK8_16X12X1", param::MatrixMul::Format::MK8, dtype::Float16{},
            dtype::Float16{}, dtype::Float16{}, "AARCH64_F16_K8X24X1");
}
#endif

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32) {
    constexpr size_t RUNS = 50;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int16{})
            .set_dtype(1, dtype::Int16{})
            .set_dtype(2, dtype::Int32{})
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K, int mask) {
        param::MatrixMul param;
        param.transposeA = mask & 0x1;
        param.transposeB = mask & 0x2;
        benchmarker_int.set_param(param);
        benchmarker_float.set_param(param);
        TensorShape A, B;
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N} %d{TA} %d{TB}} "
               "float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, param.transposeA, param.transposeB, float_used,
               computations / float_used, int_used, computations / int_used,
               float_used / int_used);
    };
    constexpr int mask = 4;
    for (auto i = 0; i < mask; i++) {
        for (size_t M : {8, 64, 112, 256}) {
            for (size_t K : {8, 64, 112, 256}) {
                for (size_t N : {8, 64, 112, 256}) {
                    run(M, N, K, i);
                }
            }
        }
    }
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_MK4) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
            "AARCH64_F32_MK4_4x16", param::MatrixMul::Format::MK4, dtype::Float32{},
            dtype::Float32{}, dtype::Float32{});
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_PACK_MK4) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
            "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, dtype::Float32{},
            dtype::Float32{}, dtype::Float32{}, "AARCH64_F32K8X12X1");
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
            "AARCH64_INT16X16X32_MK8_8X8", param::MatrixMul::Format::MK8,
            dtype::Int16{}, dtype::Int16{}, dtype::Int32{});
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = param.transposeB = true;
    Benchmarker<MatrixMul> benchmarker_k12x8(handle());
    Benchmarker<MatrixMul> benchmarker_k8x12(handle());
    benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
    benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
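    // Note: despite its name, benchmarker_k12x8 is bound to the K4X16 kernel below
    // and serves as the baseline that AARCH64_F32K8X12X1 is compared against.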
    benchmarker_k12x8.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
    benchmarker_k8x12.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
    auto run = [&](size_t M, size_t N, size_t K) {
        auto k12x8_used = benchmarker_k12x8.exec({{K, M}, {N, K}, {}}) / RUNS;
        auto k8x12_used = benchmarker_k8x12.exec({{K, M}, {N, K}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
               "k8x12: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
               computations / k8x12_used, k12x8_used / k8x12_used);
    };
    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12_NO_TRANS) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker_k12x8(handle());
    Benchmarker<MatrixMul> benchmarker_k8x12(handle());
    benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
    benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
    benchmarker_k12x8.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
    benchmarker_k8x12.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
    auto run = [&](size_t M, size_t N, size_t K) {
        auto k12x8_used = benchmarker_k12x8.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto k8x12_used = benchmarker_k8x12.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
               "k8x12: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
               computations / k8x12_used, k12x8_used / k8x12_used);
    };
    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
#endif // MEGDNN_WITH_BENCHMARK

// vim: syntax=cpp.doxygen