/**
 * \file dnn/test/arm_common/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/arm_common/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/rng.h"
#include "test/common/task_record_check.h"

#if MGB_ENABLE_CPUINFO
#include "cpuinfo.h"
#endif

using namespace megdnn;
using namespace test;
TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x32) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x16) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_QUINT8) {
    matrix_mul::check_matrix_mul(
            dtype::Quantized8Asymm(1.2f, (uint8_t)127),
            dtype::Quantized8Asymm(1.3f, (uint8_t)129), {}, handle());
}

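// Note: the checks below pin the dispatched kernel via AlgoChecker in
// set_before_exec_callback(), so each shape loop exercises a single named
// algorithm; the loop bounds target that kernel's edge cases (small M,
// tiny K, and the N == 1 path).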
TEST_F(ARM_COMMON, MATRIX_MUL_FP32) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));
    // M < 8
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // M = 8, K = 1, 2
    for (size_t M : {8})
        for (size_t K : {1, 2})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {1})
                run(M, K, N);
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, MATRIX_MUL_FP16) {
    Checker<MatrixMul> checker(handle());
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);

    using Param = MatrixMul::Param;
    auto args = matrix_mul::get_matmul_args_no_mask();
    for (auto& arg : args) {
        size_t m = arg.m, n = arg.n, k = arg.k;
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    }
}

TEST_F(ARM_COMMON, MATRIX_MUL_FP16_TEST) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));
    // M = 1, 2, 3, 4
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {13, 1024, 2048})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {1})
                run(M, K, N);
}
#endif

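// In the quantized checks below the output scale follows from the inputs:
// QuantizedS8(2.5f) * QuantizedS8(2.5f) accumulates into
// QuantizedS32(2.5f * 2.5f = 6.25f).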
TEST_F(ARM_COMMON, QINT8x8x32_GEMV) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {1, 10, 16, 33, 64})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {1})
                run(M, K, N);
}

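// MK4 packs operands in 4x4 blocks: A is laid out as {M/4, K/4, 4, 4} and the
// right-hand vector as {K/4, 1, 4}, so M and K must both be multiples of 4.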
TEST_F(ARM_COMMON, QINT8x8x32_GEMV_MK4) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV_MK4"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());

    auto run = [&](size_t M, size_t K, size_t N) {
        MEGDNN_MARK_USED_VAR(N);
        Param param;
        param.format = param::MatrixMul::Format::MK4;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 16, 20, 24, 256, 1024})
            run(M, K, 1);
}

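// The MK4_DOT variant relies on the Armv8.2 dot-product (SDOT/UDOT)
// instructions, hence the MGB_ENABLE_DOT guard.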
#if MGB_ENABLE_DOT
TEST_F(ARM_COMMON, QINT8x8x32_GEMV_MK4_DOT) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV_MK4_DOT"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.format = param::MatrixMul::Format::MK4_DOT;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 16, 20, 24, 256, 1024})
            run(M, K, 1);
}
#endif

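// GEVM: a vector-matrix product (M == 1) with transposeB == true, so B is
// supplied in {N, K} layout.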
TEST_F(ARM_COMMON, QINT8x8x32_GEVM) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_GEVM"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = true;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{N, K};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // M = 1
    for (size_t N : {1, 10, 16, 33, 64})
        for (size_t K : {7, 512, 1024})
            for (size_t M : {1})
                run(M, K, N);
}

TEST_F(ARM_COMMON, FP32_GEVM) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_GEVM"));
    checker.set_epsilon(1e-2);

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = true;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{N, K};
        checker.set_param(param).execs({A, B, {}});
    };

    // M = 1
    for (size_t M : {1})
        for (size_t K : {1000, 4096})
            for (size_t N : {1000, 4096})
                run(M, K, N);
}

TEST_F(ARM_COMMON, FP32_GEMV_MK4) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV_MK4"));
    checker.set_epsilon(1e-2);

    auto run = [&](size_t M, size_t K) {
        Param param;
        param.format = param::MatrixMul::Format::MK4;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param).execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 128, 256, 4096})
            run(M, K);
}

TEST_F(ARM_COMMON, MATRIX_MUL_RECORD) {
    TaskRecordChecker<MatrixMul> checker(0);
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);

    using Param = MatrixMul::Param;
    auto args = matrix_mul::get_matmul_args_no_mask();
    for (auto& arg : args) {
        size_t m = arg.m, n = arg.n, k = arg.k;
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .execs({A, B, {}});
    }
}

#if MEGDNN_WITH_BENCHMARK
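// Throughput below is derived as GFLOPS = 2 * M * K * N * 1e-6 / time_ms:
// each output element costs one multiply and one add, and Benchmarker::exec()
// reports milliseconds, here averaged over exec_times runs. The untimed
// warm-up loops give the CPU time to reach a steady clock before measuring.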
TEST_F(ARM_COMMON, BENCHMARK_SGEMV) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);

    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2.f * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 4, 8})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);

    for (size_t M : {4, 64, 1024, 4096})
        for (size_t K : {128, 256, 1024, 4096})
            run(M, K, 1);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP32) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));

    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2 * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    run(12, 48, 1);
    run(48, 12, 1);
    run(32, 128, 1);
    run(128, 32, 1);
    run(64, 256, 1);
    run(256, 64, 1);
    run(128, 512, 1);
    run(512, 128, 1);
    run(256, 1024, 1);
    run(1024, 256, 1);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_MK4) {
    int exec_times = 10;
    using Param = MatrixMul::Param;
    Param param;
    param.format = param::MatrixMul::Format::MK4;
    param.transposeA = false;
    param.transposeB = false;

    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_param(param);

    auto run = [&](size_t M, size_t K) {
        printf("SGEMV_MK4: (%zu, %zu, 1)\n", M, K);
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        auto time = benchmarker.exec({A, B, {}}) / exec_times;
        auto computations = 2.f * M * K * 1e-6;
        auto perf = computations / time;
        printf("gemv mk4 fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_display(false)
                .exec({{4, 256, 4, 4}, {256, 1, 4}, {}});
    }

    // run gemv mk4
    for (size_t M : {4, 64, 1024, 4096})
        for (size_t K : {128, 1024, 4096})
            run(M, K);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP16) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));

    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV_FP16: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2 * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp16, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMM) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    float mod = 1000 * exec_times / 1e9;

    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        printf("SGEMM: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
        time = benchmarker.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        printf("gemm, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

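// The comparison benchmarks below run the same shapes through an int/quantized
// benchmarker and an fp32 one, and report the float/int time ratio as the
// speedup of the integer kernel.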
TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_INT8x8x32) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_TRANSPOSED_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = param.transposeB = true;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
#endif

// vim: syntax=cpp.doxygen