You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 36 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944
  1. #include "test/aarch64/fixture.h"
  2. #include "test/common/benchmarker.h"
  3. #include "test/common/checker.h"
  4. #include "test/common/matrix_mul.h"
  5. #include "test/common/rng.h"
  6. #include "test/arm_common/cpuinfo_help.h"
  7. using namespace megdnn;
  8. using namespace test;
// Exercise the fp32 "AARCH64_F32K8X12X1" gemm kernel through the shared checker.
TEST_F(AARCH64, MATRIX_MUL_FP32K8X12) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32K8X12X1");
}
  14. #if MGB_ENABLE_CPUINFO
// Same kernel, but with the reported CPU micro-arch temporarily replaced by
// cortex-a53 (RAII guard restores it at scope exit), so the a53 code path runs.
TEST_F(AARCH64, MATRIX_MUL_FP32K8X12_A53) {
    CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a53);
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32K8X12X1");
}
// As above, with the micro-arch temporarily reported as cortex-a55.
TEST_F(AARCH64, MATRIX_MUL_FP32K8X12_A55) {
    CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a55);
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32K8X12X1");
}
  27. #endif
// fp32 gemm, 4x16 kernel variant.
TEST_F(AARCH64, MATRIX_MUL_FP32K4X16) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32K4X16X1");
}
// fp32 gemm in MK4 blocked layout (packed), 8x12 kernel.
TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
}
  38. #if MGB_ENABLE_CPUINFO
// MK4 packed fp32 kernel with the micro-arch temporarily reported as cortex-a53.
TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4_A53) {
    CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a53);
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
}
// MK4 packed fp32 kernel with the micro-arch temporarily reported as cortex-a55.
TEST_F(AARCH64, MATRIX_MUL_FP32_PACK_MK4_A55) {
    CpuInfoTmpReplace cpu_replace_guard(cpuinfo_uarch_cortex_a55);
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, 1);
}
  51. #endif
// fp32 gemm in MK4 layout, 4x16 kernel variant.
TEST_F(AARCH64, MATRIX_MUL_FP32_MK4) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "AARCH64_F32_MK4_4x16", param::MatrixMul::Format::MK4, 1);
}
  57. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// fp16 gemm, 8x24 kernel (only built when fp16 vector arithmetic is available).
TEST_F(AARCH64, MATRIX_MUL_F16_K8X24X1) {
    matrix_mul::check_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
            "AARCH64_F16_K8X24X1");
}
// fp16 gemm in MK8 blocked layout, 8x8 kernel.
TEST_F(AARCH64, MATRIX_MUL_F16_MK8) {
    matrix_mul::check_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
            "AARCH64_F16_MK8_8X8", param::MatrixMul::Format::MK8, 1);
}
  68. #endif
  69. #if MGB_ENABLE_DOT
// int8 -> int32 gemm using the dot-product (sdot) 8x12x4 kernel.
TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K8X12X4_DOTPROD) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
            "AARCH64_INT8X8X32_K8X12X4_DOTPROD");
}
  75. TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_MK4_8X12X4_DOTPROD) {
  76. std::vector<matrix_mul::TestArg> args;
  77. for (size_t m : {1, 2, 3, 4, 5, 6, 7, 10, 11})
  78. for (size_t n : {2, 3, 4, 5, 8, 12, 13, 14, 15, 16, 31})
  79. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  80. args.emplace_back(m, n, k, 0);
  81. matrix_mul::check_matrix_mul(
  82. dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
  83. "AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD", param::MatrixMul::Format::MK4_DOT,
  84. 1, 1e-3, std::move(args));
  85. }
  86. #else
// int8 -> int32 gemm, non-dot 4x4x16 kernel (fallback when sdot is unavailable).
TEST_F(AARCH64, MATRIX_MUL_INT8X8X32_K4X4X16) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
            "AARCH64_INT8X8X32_K4X4X16");
}
  92. TEST_F(AARCH64, MATRIX_MUL_INT8_MK4) {
  93. std::vector<matrix_mul::TestArg> args;
  94. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  95. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  96. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  97. args.emplace_back(m, n, k, 0);
  98. matrix_mul::check_matrix_mul(
  99. dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
  100. "AARCH64_INT8X8X32_MK4_4X4X16", param::MatrixMul::Format::MK4, 1, 1e-3,
  101. std::move(args));
  102. }
// int8 -> int16 gemm, MK4 layout, K8X8X8 kernel; dense sweep of small shapes
// (k starts at 2 here — note the other sweeps start at 1).
TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_MK4) {
    std::vector<matrix_mul::TestArg> args;
    for (size_t m : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17})
        for (size_t n : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 24})
            for (size_t k : {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                             16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29})
                args.emplace_back(m, n, k, 0);
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_MK4_K8X8X8", param::MatrixMul::Format::MK4, 1, 1e-3,
            std::move(args));
}
// int8 -> int16 gemm, MK4 layout, 4x4x8 kernel variant.
TEST_F(AARCH64, MATRIX_MUL_MK4_8x8x16_4x4) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_MK4_4X4X8", param::MatrixMul::Format::MK4, 1);
}
// int8 -> int16 gemm, MK4 layout, 16x12x4 kernel variant.
TEST_F(AARCH64, MATRIX_MUL_MK4_8x8x16) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_MK4_16X12X4", param::MatrixMul::Format::MK4, 1);
}
// int8 -> int32 gemm, plain layout, 8x8x8 kernel.
TEST_F(AARCH64, MATRIX_MUL_INT8x8x32_K8x8x8) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
            "AARCH64_INT8X8X32_K8X8X8");
}
  130. #endif
// int8 -> int16 gemm, plain layout, 8x8x8 kernel.
TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K8x8x8) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_K8X8X8");
}
// int8 -> int16 gemm, plain layout, 4x4x16 kernel.
TEST_F(AARCH64, MATRIX_MUL_INT8x8x16_K4x4x16) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
            "AARCH64_INT8X8X16_K4X4X16");
}
  141. TEST_F(AARCH64, MATRIX_MUL_INT4x4x16_K8x8x8_QUANTIZEDS4) {
  142. param::MatrixMul param;
  143. param.transposeA = false;
  144. param.transposeB = false;
  145. Checker<MatrixMul> checker(handle());
  146. checker.set_dtype(0, dtype::QuantizedS4{0.6})
  147. .set_dtype(1, dtype::QuantizedS4{0.5})
  148. .set_dtype(2, dtype::QuantizedS16{0.6 * 0.5})
  149. .set_param(param);
  150. checker.set_before_exec_callback(
  151. AlgoChecker<MatrixMul>("AARCH64_INT4X4X16_K8X8X8"));
  152. auto run = [&](size_t M, size_t N, size_t K) {
  153. printf("M N K %zu %zu %zu \n", M, N, K);
  154. TensorShape A, B;
  155. if (param.transposeA) {
  156. A = TensorShape{K, M};
  157. } else {
  158. A = TensorShape{M, K};
  159. }
  160. if (param.transposeB) {
  161. B = TensorShape{N, K};
  162. } else {
  163. B = TensorShape{K, N};
  164. }
  165. checker.exec({A, B, {}});
  166. };
  167. for (size_t m : {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 16, 20})
  168. for (size_t n : {2, 4, 6, 8, 10, 12, 14, 16, 24})
  169. for (size_t k : {2, 4, 6, 8, 10, 12, 14, 16, 32})
  170. run(m, n, k);
  171. for (size_t k = 4; k <= 256; k *= 8) {
  172. for (size_t m = 4; m <= 256; m *= 4) {
  173. for (size_t n = 4; n <= 256; n *= 4) {
  174. run(m, n, k);
  175. }
  176. }
  177. }
  178. param.transposeA = true;
  179. run(8, 8, 8);
  180. run(16, 8, 16);
  181. param.transposeB = true;
  182. run(8, 8, 8);
  183. run(16, 16, 16);
  184. }
// int16 -> int32 gemm, plain layout, 12x8 kernel.
TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_K12X8X1) {
    matrix_mul::check_matrix_mul(
            dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
            "AARCH64_INT16X16X32_K12X8X1");
}
// int16 -> int32 gemm in MK8 blocked layout, 8x8 kernel.
TEST_F(AARCH64, MATRIX_MUL_INT16x16x32_MK8) {
    matrix_mul::check_matrix_mul(
            dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
            "AARCH64_INT16X16X32_MK8_8X8", param::MatrixMul::Format::MK8, 1);
}
  195. //! FIXME: need to add tests of GEMV and QUINT8
  196. #if MEGDNN_WITH_BENCHMARK
  197. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K4X16) {
  198. constexpr size_t RUNS = 50;
  199. param::MatrixMul param;
  200. param.transposeA = false;
  201. param.transposeB = false;
  202. Benchmarker<MatrixMul> benchmarker_K4X16(handle());
  203. Benchmarker<MatrixMul> benchmarker_K12X8(handle());
  204. benchmarker_K4X16.set_times(RUNS)
  205. .set_dtype(0, dtype::Float32{})
  206. .set_dtype(1, dtype::Float32{})
  207. .set_dtype(2, dtype::Float32{})
  208. .set_param(param)
  209. .set_display(false);
  210. benchmarker_K4X16.set_before_exec_callback(
  211. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  212. benchmarker_K12X8.set_before_exec_callback(
  213. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  214. benchmarker_K12X8.set_times(RUNS)
  215. .set_dtype(0, dtype::Float32{})
  216. .set_dtype(1, dtype::Float32{})
  217. .set_dtype(2, dtype::Float32{})
  218. .set_param(param)
  219. .set_display(false);
  220. auto run = [&](size_t M, size_t N, size_t K) {
  221. TensorShape A, B;
  222. if (param.transposeA) {
  223. A = TensorShape{K, M};
  224. } else {
  225. A = TensorShape{M, K};
  226. }
  227. if (param.transposeB) {
  228. B = TensorShape{N, K};
  229. } else {
  230. B = TensorShape{K, N};
  231. }
  232. auto k4x16_used = benchmarker_K4X16.exec({A, B, {}}) / RUNS;
  233. auto k12x8_used = benchmarker_K12X8.exec({A, B, {}}) / RUNS;
  234. float computations = 2.f * M * K * N * 1e-6;
  235. printf("run: {%zu{M} %zu{K} %zu{N}} k4x16: %f ms %f Gflops k12x8: %f "
  236. "ms "
  237. "%f Gflops k4x16_vs_k12x8: %f\n",
  238. M, K, N, k4x16_used, computations / k4x16_used, k12x8_used,
  239. computations / k12x8_used, k12x8_used / k4x16_used);
  240. };
  241. run(256, 256, 128);
  242. run(384, 384, 384);
  243. for (size_t k = 4; k <= 256; k *= 8) {
  244. for (size_t m = 4; m <= 256; m *= 4) {
  245. for (size_t n = 4; n <= 256; n *= 4) {
  246. run(m, n, k);
  247. }
  248. printf("\n");
  249. }
  250. printf("\n");
  251. }
  252. }
// Benchmark int8x8x16 (K8X8X8) against an fp32 baseline (no algo pinned) and
// int8x8x32 (K8X8X8); prints ms, Gflops and speedups per shape.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_8X8X8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    Benchmarker<MatrixMul> benchmarker_int32(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K8X8X8"));
    benchmarker_int32.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X8X8"));
    benchmarker_int32.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    // fp32 baseline: dtypes left at defaults, no specific algo pinned.
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        TensorShape A, B;
        // Honor the transpose flags when shaping operands (both false here).
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
        auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
        // 2*M*K*N flops scaled so flops/ms prints as Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used, int32_used / int_used);
    };
    run(256, 256, 256);
    for (size_t k = 4; k <= 256; k *= 8) {
        for (size_t m = 4; m <= 256; m *= 4) {
            for (size_t n = 4; n <= 256; n *= 4) {
                run(m, n, k);
            }
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}
  310. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT32_MK_4X4X16) {
  311. constexpr size_t RUNS = 50;
  312. param::MatrixMul param;
  313. param.transposeA = false;
  314. param.transposeB = false;
  315. Benchmarker<MatrixMul> benchmarker(handle());
  316. Benchmarker<MatrixMul> benchmarker_mk4(handle());
  317. benchmarker.set_times(RUNS)
  318. .set_dtype(0, dtype::Int8{})
  319. .set_dtype(1, dtype::Int8{})
  320. .set_dtype(2, dtype::Int32{})
  321. .set_param(param)
  322. .set_display(false);
  323. benchmarker.set_before_exec_callback(
  324. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
  325. param.format = MatrixMul::Param::Format::MK4;
  326. benchmarker_mk4.set_before_exec_callback(
  327. AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_4X4X16"));
  328. benchmarker_mk4.set_times(RUNS)
  329. .set_dtype(0, dtype::Int8{})
  330. .set_dtype(1, dtype::Int8{})
  331. .set_dtype(2, dtype::Int32{})
  332. .set_param(param)
  333. .set_display(false);
  334. auto run = [&](size_t M, size_t N, size_t K) {
  335. auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
  336. auto mk_used =
  337. benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
  338. float computations = 2.f * M * K * N * 1e-6;
  339. printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
  340. "%f Gflops speedup_vs_normal: %f\n",
  341. M, K, N, default_used, computations / default_used, mk_used,
  342. computations / mk_used, default_used / mk_used);
  343. };
  344. run(256, 256, 128);
  345. for (size_t k = 4; k <= 512; k *= 2) {
  346. for (size_t m = 4; m <= 512; m *= 2) {
  347. for (size_t n = 4; n <= 512; n *= 2) {
  348. run(m, n, k);
  349. }
  350. }
  351. std::cout << std::endl;
  352. }
  353. }
// Benchmark three int8 -> int16 kernels on one shape: plain-layout K4X4X16
// (baseline) vs MK4_4X4X8 vs MK4_16X12X4.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_MK4_8x8x16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    Benchmarker<MatrixMul> benchmarker_mk4_16x12(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
    // The two MK4 contenders share the MK4 format param.
    param.format = MatrixMul::Param::Format::MK4;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_4X4X8"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker_mk4_16x12.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_16X12X4"));
    benchmarker_mk4_16x12.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        // MK4 operands carry the 4x4 (resp. 4) block as trailing dims.
        auto mk_used =
                benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
        auto mk4_16x12_used =
                benchmarker_mk4_16x12.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup: %f, mk4_16x12 %f Gflops speedup: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used,
               computations / mk4_16x12_used, default_used / mk4_16x12_used);
    };
    run(384, 384, 384);
}
// Benchmark int8x8x16 K4X4X16 against the quantized-int4 4x4x16 path over a
// large shape sweep plus hand-picked shapes.
// NOTE(review): the int4 benchmarker has no before-exec callback, so no
// specific int4 algo is pinned — confirm this is intended.
TEST_F(AARCH64, BENCHMARK_4x4x16_vs_8x8x16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_int4_4x4x16(handle());
    // QuantizedS16 output scale 0.09 = 0.3 * 0.3 (product of input scales).
    benchmarker_int4_4x4x16.set_times(RUNS)
            .set_dtype(0, dtype::QuantizedS4{0.3})
            .set_dtype(1, dtype::QuantizedS4{0.3})
            .set_dtype(2, dtype::QuantizedS16{0.09})
            .set_param(param)
            .set_display(false);
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto int4416_used = benchmarker_int4_4x4x16.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal 8x8x16 used: %f ms %f "
               "Gflops int4416 used %f int4416_gflops %f speedup %f\n",
               M, K, N, default_used, computations / default_used, int4416_used,
               computations / int4416_used, default_used / int4416_used);
    };
    // Very large sweep: 32..1024 in steps of 32 for m/n, 32..512 for k.
    for (int m = 32; m <= 1024; m += 32)
        for (int n = 32; n <= 1024; n += 32)
            for (int k = 32; k <= 512; k += 32)
                run(m, n, k);
    // Hand-picked shapes.
    run(32, 32, 32);
    run(32, 32, 8);
    run(32, 32, 16);
    run(32, 32, 24);
    run(32 * 2, 32 * 2, 32);
    run(32 * 4, 32 * 4, 32);
    run(32 * 6, 32 * 6, 32);
    run(32 * 8, 32 * 8, 32);
    run(32 * 2, 32 * 2, 32 * 2);
    run(32 * 4, 32 * 4, 32 * 3);
    run(32 * 6, 32 * 6, 32 * 4);
    run(32 * 8, 32 * 8, 32 * 5);
    run(32 * 10, 32 * 10, 32 * 10);
    run(384, 384, 384);
    run(256, 256, 384);
    run(512, 512, 384);
    run(1024, 1024, 384);
}
// Benchmark int8 -> int16 kernels: plain K4X4X16 (baseline) vs MK4 K8X8X8 vs
// MK4 4X4X8, over fixed shapes and a power-of-two sweep.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_MK4_8x8x8_8x8x16_vs_4x4x16_8x8x16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    Benchmarker<MatrixMul> benchmarker_mk4_4x4x8(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
    // Both MK4 contenders share the MK4 format param.
    param.format = MatrixMul::Param::Format::MK4;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_K8X8X8"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker_mk4_4x4x8.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_MK4_4X4X8"));
    benchmarker_mk4_4x4x8.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto mk_used =
                benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
        auto mk4_4x4x8_used =
                benchmarker_mk4_4x4x8.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        // NOTE(review): the last ratio is mk4_4x4x8_used / mk_used (contender
        // over contender), unlike the other "speedup" columns which divide the
        // baseline by the contender — confirm this is the intended comparison.
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup: %f, mk4_4x4x8 %f Gflops %f ms speedup: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used,
               computations / mk4_4x4x8_used, mk4_4x4x8_used, mk4_4x4x8_used / mk_used);
    };
    run(384, 384, 384);
    run(512, 512, 512);
    run(1024, 1024, 384);
    run(256, 256, 384);
    for (int m = 32; m <= 512; m *= 2)
        for (int n = 32; n <= 512; n *= 2)
            for (int k = 32; k < 512; k *= 2) {
                run(m, n, k);
            }
}
// Benchmark int8x8x16 K4X4X16 against an fp32 baseline (no algo pinned) and
// int8x8x32 K4X4X16. (Despite the INT16 in the test name, operands are int8;
// only the accumulator dtype differs between the two pinned algos.)
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16_4X4X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    Benchmarker<MatrixMul> benchmarker_int32(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X16_K4X4X16"));
    benchmarker_int32.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K4X4X16"));
    benchmarker_int32.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    // fp32 baseline: dtypes left at defaults, no specific algo pinned.
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        TensorShape A, B;
        // Honor the transpose flags when shaping operands (both false here).
        if (param.transposeA) {
            A = TensorShape{K, M};
        } else {
            A = TensorShape{M, K};
        }
        if (param.transposeB) {
            B = TensorShape{N, K};
        } else {
            B = TensorShape{K, N};
        }
        auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
        auto int32_used = benchmarker_int32.exec({A, B, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup_vs_fp32: %f, speedup_vs_int32: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used, int32_used / int_used);
    };
    run(256, 256, 128);
    run(256, 256, 256);
    for (size_t k = 4; k <= 256; k *= 4) {
        for (size_t m = 4; m <= 256; m *= 4) {
            for (size_t n = 4; n <= 256; n *= 4) {
                run(m, n, k);
            }
        }
        std::cout << std::endl;
    }
}
  569. TEST_F(AARCH64, BENCHMARK_GEMV) {
  570. int exec_times = 10;
  571. Benchmarker<MatrixMul> benchmarker_gemm(handle());
  572. benchmarker_gemm.set_times(exec_times);
  573. float mod = 1000 * exec_times / 1e9;
  574. auto run = [&](size_t M, size_t K, size_t N) {
  575. float time = 1.f, perf = 1.f;
  576. std::cout << "GEMM: (" << M << ", " << K << ", " << N << ")" << std::endl;
  577. benchmarker_gemm.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
  578. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  579. perf = 2.f * M * K * N / time * mod;
  580. std::cout << "gemm fp32, Performance is " << perf << " Gflops" << std::endl;
  581. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  582. benchmarker_gemm.set_dtype(0, dtype::Float16()).set_dtype(1, dtype::Float16());
  583. time = benchmarker_gemm.exec({{M, K}, {K, N}, {}});
  584. perf = 2.f * M * K * N / time * mod;
  585. std::cout << "gemm fp16, Performance is " << perf << " Gflops" << std::endl;
  586. #endif
  587. };
  588. std::cout << "warm up:\n";
  589. for (int i = 0; i < 50; i++) {
  590. benchmarker_gemm.set_dtype(0, dtype::Float32())
  591. .set_dtype(1, dtype::Float32())
  592. .set_display(false)
  593. .exec({{256, 256}, {256, 256}, {}});
  594. benchmarker_gemm.set_display(true);
  595. }
  596. // run gemv
  597. for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
  598. for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 64, 256})
  599. for (size_t N : {112})
  600. run(M, K, N);
  601. }
  602. #if MGB_ENABLE_DOT
  603. TEST_F(AARCH64, BENCHMARK_TRANSPOSED_MATRIX_MUL_INT_8X8X32) {
  604. constexpr size_t RUNS = 50;
  605. param::MatrixMul param;
  606. param.transposeA = param.transposeB = true;
  607. Benchmarker<MatrixMul> benchmarker_int(handle());
  608. benchmarker_int.set_times(RUNS)
  609. .set_dtype(0, dtype::Int8{})
  610. .set_dtype(1, dtype::Int8{})
  611. .set_dtype(2, {})
  612. .set_param(param)
  613. .set_display(false);
  614. Benchmarker<MatrixMul> benchmarker_float(handle());
  615. benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
  616. auto run = [&](size_t M, size_t N, size_t K) {
  617. auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
  618. auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
  619. float computations = 2.f * M * K * N * 1e-6;
  620. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  621. "%f Gflops speedup: %f\n",
  622. M, K, N, float_used, computations / float_used, int_used,
  623. computations / int_used, float_used / int_used);
  624. };
  625. run(256, 12 * 24, 256);
  626. for (size_t M : {8, 64, 112, 256}) {
  627. for (size_t K : {8, 64, 112, 256}) {
  628. for (size_t N : {8, 64, 112, 256}) {
  629. run(M, N, K);
  630. }
  631. }
  632. }
  633. }
  634. TEST_F(AARCH64, BENCHMARK_GEMV_INT_8X8X32) {
  635. constexpr size_t RUNS = 50;
  636. param::MatrixMul param;
  637. Benchmarker<MatrixMul> benchmarker_int(handle());
  638. benchmarker_int.set_times(RUNS)
  639. .set_dtype(0, dtype::Int8{})
  640. .set_dtype(1, dtype::Int8{})
  641. .set_dtype(2, {})
  642. .set_param(param)
  643. .set_display(false);
  644. Benchmarker<MatrixMul> benchmarker_float(handle());
  645. benchmarker_float.set_display(false).set_times(RUNS);
  646. auto run = [&](size_t M, size_t N, size_t K) {
  647. auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
  648. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  649. float computations = 2.f * M * K * N * 1e-6;
  650. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
  651. "%f Gflops speedup: %f\n",
  652. M, K, N, float_used, computations / float_used, int_used,
  653. computations / int_used, float_used / int_used);
  654. };
  655. for (size_t M : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
  656. for (size_t N : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
  657. for (size_t K : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 256})
  658. run(M, N, K);
  659. }
// Benchmark the plain-layout int8x8x32 K8X12X4 kernel against the MK4_DOT
// dot-product variant; prints ms, Gflops and the speedup per shape.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT8X8X32_MK4_8X12X4) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_K8X12X4"));
    // Contender uses the MK4_DOT blocked layout.
    param.format = MatrixMul::Param::Format::MK4_DOT;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH64_INT8X8X32_MK4_8X12X4_DOTPROD"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        // MK4 operands carry the 4x4 (resp. 4) block as trailing dims.
        auto mk_used =
                benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup_vs_normal: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 512; k *= 2) {
        for (size_t m = 4; m <= 512; m *= 2) {
            for (size_t n = 4; n <= 512; n *= 2) {
                run(m, n, k);
            }
        }
        std::cout << std::endl;
    }
}
  704. #endif // MGB_ENABLE_DOT
  705. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Contrast-benchmark the fp16 MK8 kernel against the plain-layout fp16 8x24
// kernel on the shared MK-packed benchmark shape set.
TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_F16_MK8) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
            "AARCH64_F16_MK8_8X8", param::MatrixMul::Format::MK8, dtype::Float16{},
            dtype::Float16{}, dtype::Float16{}, "AARCH64_F16_K8X24X1");
}
  713. #endif
  714. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32) {
  715. constexpr size_t RUNS = 50;
  716. Benchmarker<MatrixMul> benchmarker_int(handle());
  717. benchmarker_int.set_times(RUNS)
  718. .set_dtype(0, dtype::Int16{})
  719. .set_dtype(1, dtype::Int16{})
  720. .set_dtype(2, dtype::Int32{})
  721. .set_display(false);
  722. Benchmarker<MatrixMul> benchmarker_float(handle());
  723. benchmarker_float.set_display(false).set_times(RUNS);
  724. auto run = [&](size_t M, size_t N, size_t K, int mask) {
  725. param::MatrixMul param;
  726. param.transposeA = mask & 0x1;
  727. param.transposeB = mask & 0x2;
  728. benchmarker_int.set_param(param);
  729. benchmarker_float.set_param(param);
  730. TensorShape A, B;
  731. if (param.transposeA) {
  732. A = TensorShape{K, M};
  733. } else {
  734. A = TensorShape{M, K};
  735. }
  736. if (param.transposeB) {
  737. B = TensorShape{N, K};
  738. } else {
  739. B = TensorShape{K, N};
  740. }
  741. auto int_used = benchmarker_int.exec({A, B, {}}) / RUNS;
  742. auto float_used = benchmarker_float.exec({A, B, {}}) / RUNS;
  743. float computations = 2.f * M * K * N * 1e-6;
  744. printf("run: {%zu{M} %zu{K} %zu{N} %d{TA} %d{TB}} "
  745. "float: %f ms %f Gflops int: %f ms "
  746. "%f Gflops speedup: %f\n",
  747. M, K, N, param.transposeA, param.transposeB, float_used,
  748. computations / float_used, int_used, computations / int_used,
  749. float_used / int_used);
  750. };
  751. constexpr int mask = 4;
  752. for (auto i = 0; i < mask; i++) {
  753. for (size_t M : {8, 64, 112, 256}) {
  754. for (size_t K : {8, 64, 112, 256}) {
  755. for (size_t N : {8, 64, 112, 256}) {
  756. run(M, N, K, i);
  757. }
  758. }
  759. }
  760. }
  761. }
  762. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_MK4) {
  763. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
  764. matrix_mul::benchmark_with_contrast(
  765. handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
  766. "AARCH64_F32_MK4_4x16", param::MatrixMul::Format::MK4, dtype::Float32{},
  767. dtype::Float32{}, dtype::Float32{});
  768. }
  769. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_PACK_MK4) {
  770. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(16);
  771. matrix_mul::benchmark_with_contrast(
  772. handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
  773. "AARCH64_F32_MK4_K8X12X1", param::MatrixMul::Format::MK4, dtype::Float32{},
  774. dtype::Float32{}, dtype::Float32{}, "AARCH64_F32K8X12X1");
  775. }
  776. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
  777. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  778. matrix_mul::benchmark_with_contrast(
  779. handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  780. "AARCH64_INT16X16X32_MK8_8X8", param::MatrixMul::Format::MK8,
  781. dtype::Int16{}, dtype::Int16{}, dtype::Int32{});
  782. }
  783. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12) {
  784. constexpr size_t RUNS = 50;
  785. param::MatrixMul param;
  786. param.transposeA = param.transposeB = true;
  787. Benchmarker<MatrixMul> benchmarker_k12x8(handle());
  788. Benchmarker<MatrixMul> benchmarker_k8x12(handle());
  789. benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
  790. benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
  791. benchmarker_k12x8.set_before_exec_callback(
  792. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  793. benchmarker_k8x12.set_before_exec_callback(
  794. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  795. auto run = [&](size_t M, size_t N, size_t K) {
  796. auto k12x8_used = benchmarker_k12x8.exec({{K, M}, {N, K}, {}}) / RUNS;
  797. auto k8x12_used = benchmarker_k8x12.exec({{K, M}, {N, K}, {}}) / RUNS;
  798. float computations = 2.f * M * K * N * 1e-6;
  799. printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
  800. "k8x12: %f ms "
  801. "%f Gflops speedup: %f\n",
  802. M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
  803. computations / k8x12_used, k12x8_used / k8x12_used);
  804. };
  805. run(256, 12 * 24, 256);
  806. for (size_t M : {8, 64, 112, 256}) {
  807. for (size_t K : {8, 64, 112, 256}) {
  808. for (size_t N : {8, 64, 112, 256}) {
  809. run(M, N, K);
  810. }
  811. }
  812. }
  813. }
  814. TEST_F(AARCH64, BENCHMARK_MATRIX_MUL_FP32_K8X12_NO_TRANS) {
  815. constexpr size_t RUNS = 50;
  816. param::MatrixMul param;
  817. param.transposeA = param.transposeB = false;
  818. Benchmarker<MatrixMul> benchmarker_k12x8(handle());
  819. Benchmarker<MatrixMul> benchmarker_k8x12(handle());
  820. benchmarker_k12x8.set_param(param).set_display(false).set_times(RUNS);
  821. benchmarker_k8x12.set_param(param).set_display(false).set_times(RUNS);
  822. benchmarker_k12x8.set_before_exec_callback(
  823. AlgoChecker<MatrixMul>("AARCH64_F32K4X16X1"));
  824. benchmarker_k8x12.set_before_exec_callback(
  825. AlgoChecker<MatrixMul>("AARCH64_F32K8X12X1"));
  826. auto run = [&](size_t M, size_t N, size_t K) {
  827. auto k12x8_used = benchmarker_k12x8.exec({{M, K}, {K, N}, {}}) / RUNS;
  828. auto k8x12_used = benchmarker_k8x12.exec({{M, K}, {K, N}, {}}) / RUNS;
  829. float computations = 2.f * M * K * N * 1e-6;
  830. printf("run: {%zu{M} %zu{K} %zu{N}} float k12x8: %f ms %f Gflops "
  831. "k8x12: %f ms "
  832. "%f Gflops speedup: %f\n",
  833. M, K, N, k12x8_used, computations / k12x8_used, k8x12_used,
  834. computations / k8x12_used, k12x8_used / k8x12_used);
  835. };
  836. run(256, 12 * 24, 256);
  837. for (size_t M : {8, 64, 112, 256}) {
  838. for (size_t K : {8, 64, 112, 256}) {
  839. for (size_t N : {8, 64, 112, 256}) {
  840. run(M, N, K);
  841. }
  842. }
  843. }
  844. }
  845. #endif // MEGDNN_WITH_BENCHMARK
  846. // vim: syntax=cpp.doxygen