You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582
  1. #include "test/common/matrix_mul.h"
  2. #include "test/armv7/fixture.h"
  3. #include "test/common/benchmarker.h"
  4. #include "test/common/checker.h"
  5. #include "test/common/rng.h"
  6. using namespace megdnn;
  7. using namespace test;
  8. TEST_F(ARMV7, MATRIX_MUL) {
  9. matrix_mul::check_matrix_mul(
  10. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  11. "ARMV7_F32");
  12. }
  13. TEST_F(ARMV7, MATRIX_MUL_MK4) {
  14. matrix_mul::check_matrix_mul(
  15. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  16. "ARMV7_F32_MK4_4x8", param::MatrixMul::Format::MK4, 1);
  17. }
  18. TEST_F(ARMV7, MATRIX_MUL_PACK_MK4) {
  19. matrix_mul::check_matrix_mul(
  20. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  21. "ARMV7_F32_MK4_PACK_4X12", param::MatrixMul::Format::MK4, 1);
  22. }
  23. TEST_F(ARMV7, MATRIX_MUL_MK4_INT8) {
  24. std::vector<matrix_mul::TestArg> args;
  25. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  26. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  27. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  28. args.emplace_back(m, n, k, 0);
  29. matrix_mul::check_matrix_mul(
  30. dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
  31. "ARMV7_INT8X8X32_MK4_4X2X16", param::MatrixMul::Format::MK4, 1, 1e-3,
  32. std::move(args));
  33. }
  34. TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_K4x8x8) {
  35. matrix_mul::check_matrix_mul(
  36. dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
  37. "ARMV7_INT8X8X16_K4X8X8");
  38. }
  39. TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_K8x8x4) {
  40. matrix_mul::check_matrix_mul(
  41. dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
  42. "ARMV7_INT8X8X16_K8X8X4");
  43. }
  44. TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_MK4_K8x8x4) {
  45. matrix_mul::check_matrix_mul(
  46. dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
  47. "ARMV7_INT8X8X16_MK4_K8X8X4", param::MatrixMul::Format::MK4, 1);
  48. }
  49. TEST_F(ARMV7, MATRIX_MUL_INT16x16x32) {
  50. matrix_mul::check_matrix_mul(
  51. dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
  52. "ARMV7_INT16X16X32_K12X4X1");
  53. }
  54. TEST_F(ARMV7, MATRIX_MUL_INT16x16x32_MK8) {
  55. matrix_mul::check_matrix_mul(
  56. dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
  57. "ARMV7_INT16X16X32_MK8_4X8", param::MatrixMul::Format::MK8, 1);
  58. }
  59. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  60. TEST_F(ARMV7, MATRIX_MUL_FP16) {
  61. matrix_mul::check_matrix_mul(
  62. dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
  63. "AARCH32_F16_K4X16X1");
  64. }
  65. TEST_F(ARMV7, MATRIX_MUL_F16_MK8) {
  66. matrix_mul::check_matrix_mul(
  67. dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
  68. "AARCH32_F16_MK8_4X8", param::MatrixMul::Format::MK8, 1);
  69. }
  70. #endif
  71. #if MGB_ENABLE_DOT
  72. TEST_F(ARMV7, MATRIX_MUL_SDOT) {
  73. matrix_mul::check_matrix_mul(
  74. dtype::Int8(), dtype::Int8(), dtype::Int32(), handle(),
  75. "AARCH32_INT8_K6X8X4");
  76. }
  77. TEST_F(ARMV7, MATRIX_MUL_UDOT) {
  78. matrix_mul::check_matrix_mul(
  79. dtype::Quantized8Asymm(4.0f, static_cast<uint8_t>(10)),
  80. dtype::Quantized8Asymm(3.0f, static_cast<uint8_t>(54)),
  81. dtype::QuantizedS32(12.0f), handle(), "AARCH32_QUINT8_K4X8X4");
  82. }
  83. TEST_F(ARMV7, MATRIX_MUL_MK4_DOT_INT8) {
  84. std::vector<matrix_mul::TestArg> args;
  85. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  86. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  87. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  88. args.emplace_back(m, n, k, 0);
  89. matrix_mul::check_matrix_mul(
  90. dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
  91. "AARCH32_INT8_MK4_8X4X4_DOTPROD", param::MatrixMul::Format::MK4_DOT, 1,
  92. 1e-3, std::move(args));
  93. }
  94. #endif
  95. #if MEGDNN_WITH_BENCHMARK
  96. namespace {
  97. void run_8x8x16_benchmark(
  98. const char* algo, Handle* handle,
  99. MatrixMul::Param::Format format = MatrixMul::Param::Format::DEFAULT) {
  100. constexpr size_t RUNS = 50;
  101. param::MatrixMul param;
  102. Benchmarker<MatrixMul> benchmarker_int(handle);
  103. Benchmarker<MatrixMul> benchmarker_int_kern_4x2x16(handle);
  104. benchmarker_int.set_before_exec_callback(
  105. AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X16"));
  106. benchmarker_int.set_times(RUNS)
  107. .set_dtype(0, dtype::Int8{})
  108. .set_dtype(1, dtype::Int8{})
  109. .set_dtype(2, dtype::Int16{})
  110. .set_param(param)
  111. .set_display(false);
  112. param::MatrixMul target_param;
  113. target_param.format = format;
  114. benchmarker_int_kern_4x2x16.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
  115. benchmarker_int_kern_4x2x16.set_times(RUNS)
  116. .set_dtype(0, dtype::Int8{})
  117. .set_dtype(1, dtype::Int8{})
  118. .set_dtype(2, dtype::Int16{})
  119. .set_param(target_param)
  120. .set_display(false);
  121. Benchmarker<MatrixMul> benchmarker_float(handle);
  122. benchmarker_float.set_display(false).set_times(RUNS);
  123. auto run = [&](size_t M, size_t N, size_t K) {
  124. auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
  125. auto int_kern_used = 1e10;
  126. if (format == MatrixMul::Param::Format::MK4) {
  127. int_kern_used = benchmarker_int_kern_4x2x16.exec(
  128. {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  129. RUNS;
  130. } else {
  131. int_kern_used =
  132. benchmarker_int_kern_4x2x16.exec({{M, K}, {K, N}, {}}) / RUNS;
  133. }
  134. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  135. float computations = 2.f * M * K * N * 1e-6;
  136. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f "
  137. "ms "
  138. "%f Gflops %s: %f ms %f Gflops "
  139. "speedup(%s/arm_common, %s/float): %f "
  140. "%f\n",
  141. M, K, N, float_used, computations / float_used, int_used,
  142. computations / int_used, algo, int_kern_used,
  143. computations / int_kern_used, algo, algo, int_used / int_kern_used,
  144. float_used / int_kern_used);
  145. };
  146. run(256, 12 * 24, 256);
  147. run(256, 256, 256);
  148. //////////////////////// gemv //////////////////////////
  149. for (size_t M : {8, 64, 112, 256}) {
  150. for (size_t K : {8, 64, 112, 256}) {
  151. run(M, 1, K);
  152. }
  153. }
  154. //////////////////////// gemm //////////////////////////
  155. for (size_t M : {8, 64, 112, 256}) {
  156. for (size_t K : {8, 16, 32, 64, 112, 256}) {
  157. for (size_t N : {8, 64, 112, 256}) {
  158. run(M, N, K);
  159. }
  160. }
  161. }
  162. }
  163. void run_8x8x16_contrast(
  164. const char* algo0, const char* algo, Handle* handle,
  165. MatrixMul::Param::Format format = MatrixMul::Param::Format::DEFAULT) {
  166. constexpr size_t RUNS = 100;
  167. param::MatrixMul param;
  168. Benchmarker<MatrixMul> benchmarker_int(handle);
  169. Benchmarker<MatrixMul> benchmarker_int_kern_4x2x16(handle);
  170. benchmarker_int.set_before_exec_callback(AlgoChecker<MatrixMul>(algo0));
  171. benchmarker_int.set_times(RUNS)
  172. .set_dtype(0, dtype::Int8{})
  173. .set_dtype(1, dtype::Int8{})
  174. .set_dtype(2, dtype::Int16{})
  175. .set_param(param)
  176. .set_display(false);
  177. param::MatrixMul target_param;
  178. target_param.format = format;
  179. benchmarker_int_kern_4x2x16.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
  180. benchmarker_int_kern_4x2x16.set_times(RUNS)
  181. .set_dtype(0, dtype::Int8{})
  182. .set_dtype(1, dtype::Int8{})
  183. .set_dtype(2, dtype::Int16{})
  184. .set_param(target_param)
  185. .set_display(false);
  186. auto run = [&](size_t M, size_t N, size_t K) {
  187. auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
  188. auto int_kern_used = 1e10;
  189. double computation = 2.0f * M * N * K * 1e-6;
  190. if (format == MatrixMul::Param::Format::MK4) {
  191. int_kern_used = benchmarker_int_kern_4x2x16.exec(
  192. {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  193. RUNS;
  194. } else {
  195. int_kern_used =
  196. benchmarker_int_kern_4x2x16.exec({{M, K}, {K, N}, {}}) / RUNS;
  197. }
  198. printf(" %f(%f)\t %f(%f)\t %f\n", int_used, computation / int_used,
  199. int_kern_used, computation / int_kern_used, int_used / int_kern_used);
  200. };
  201. printf("\nN\t K\t M\t %s ms(GFlops)\t %s ms(GFlops)\t SPEEDUP\n", algo0, algo);
  202. for (size_t M : {8}) {
  203. for (size_t K : {72}) {
  204. for (size_t N :
  205. {8, 16, 32, 64, 72, 128, 256, 512, 1024, 4096, 8192, 16384, 32768,
  206. 65536}) {
  207. printf("%zu\t %zu\t %zu\t", N, K, M);
  208. run(M, N, K);
  209. }
  210. }
  211. }
  212. printf("512\t 512\t 512\t");
  213. run(512, 512, 512);
  214. }
  215. void run_16x16x32_benchmark(const char* algo, Handle* handle) {
  216. constexpr size_t RUNS = 50;
  217. param::MatrixMul param;
  218. Benchmarker<MatrixMul> benchmarker_int(handle);
  219. benchmarker_int.set_before_exec_callback(
  220. AlgoChecker<MatrixMul>("ARMV7_INT16X16X32_K12X4X1"));
  221. benchmarker_int.set_times(RUNS)
  222. .set_dtype(0, dtype::Int16{})
  223. .set_dtype(1, dtype::Int16{})
  224. .set_dtype(2, dtype::Int32{})
  225. .set_param(param)
  226. .set_display(false);
  227. Benchmarker<MatrixMul> benchmarker_float(handle);
  228. benchmarker_float.set_display(false).set_times(RUNS);
  229. auto run = [&](size_t M, size_t N, size_t K) {
  230. auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
  231. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  232. float computations = 2.f * M * K * N * 1e-6;
  233. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
  234. "int: %f ms %f Gflops %s: \n"
  235. "speedup(%s/arm_common, %s/float): %f\n",
  236. M, K, N, float_used, computations / float_used, int_used,
  237. computations / int_used, algo, algo, algo, float_used / int_used);
  238. };
  239. run(256, 12 * 24, 256);
  240. //////////////////////// gemv //////////////////////////
  241. for (size_t M : {8, 64, 112, 256}) {
  242. for (size_t K : {8, 64, 112, 256}) {
  243. run(M, 1, K);
  244. }
  245. }
  246. //////////////////////// gemm //////////////////////////
  247. for (size_t M : {8, 64, 112, 256}) {
  248. for (size_t K : {8, 16, 32, 64, 112, 256}) {
  249. for (size_t N :
  250. {1, 2, 3, 4, 8, 64, 112, 113, 114, 115, 256, 257, 258, 259}) {
  251. run(M, N, K);
  252. }
  253. }
  254. }
  255. }
  256. #if MGB_ENABLE_DOT
  257. void run_8x8x32_benchmark(const char* algo, Handle* handle) {
  258. constexpr size_t RUNS = 50;
  259. param::MatrixMul param;
  260. Benchmarker<MatrixMul> benchmarker_int8(handle);
  261. benchmarker_int8.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
  262. benchmarker_int8.set_times(RUNS)
  263. .set_dtype(0, dtype::Int8{})
  264. .set_dtype(1, dtype::Int8{})
  265. .set_dtype(2, dtype::Int32{})
  266. .set_param(param)
  267. .set_display(false);
  268. Benchmarker<MatrixMul> benchmarker_float(handle);
  269. benchmarker_float.set_display(false).set_times(RUNS);
  270. auto run = [&](size_t M, size_t N, size_t K) {
  271. auto int_used = benchmarker_int8.exec({{M, K}, {K, N}, {}}) / RUNS;
  272. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  273. float computations = 2.f * M * K * N * 1e-6;
  274. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
  275. "int: %f ms %f Gflops %s: \n"
  276. "speedup(%s/arm_common, %s/float): %f\n",
  277. M, K, N, float_used, computations / float_used, int_used,
  278. computations / int_used, algo, algo, algo, float_used / int_used);
  279. };
  280. run(256, 12 * 24, 256);
  281. //////////////////////// gemm //////////////////////////
  282. for (size_t M : {8, 64, 112, 256}) {
  283. for (size_t K : {8, 16, 32, 64, 112, 256}) {
  284. for (size_t N : {113, 114, 115, 256, 1024}) {
  285. run(M, N, K);
  286. }
  287. }
  288. }
  289. }
  290. void run_8x8x32_quint_benchmark(Handle* handle) {
  291. constexpr size_t RUNS = 50;
  292. param::MatrixMul param;
  293. Benchmarker<MatrixMul> benchmarker_quint8_dot(handle);
  294. benchmarker_quint8_dot.set_before_exec_callback(
  295. AlgoChecker<MatrixMul>("AARCH32_QUINT8_K4X8X4"));
  296. benchmarker_quint8_dot.set_times(RUNS)
  297. .set_dtype(0, dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
  298. .set_dtype(1, dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
  299. .set_dtype(2, dtype::QuantizedS32(2.3f * 3.1f))
  300. .set_param(param)
  301. .set_display(false);
  302. Benchmarker<MatrixMul> benchmarker_quint8(handle);
  303. benchmarker_quint8.set_before_exec_callback(
  304. AlgoChecker<MatrixMul>("ARMV7_QUINT8_K4X8X8"));
  305. benchmarker_quint8.set_times(RUNS)
  306. .set_dtype(0, dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
  307. .set_dtype(1, dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
  308. .set_dtype(2, dtype::QuantizedS32(2.3f * 3.1f))
  309. .set_param(param)
  310. .set_display(false);
  311. auto run = [&](size_t M, size_t N, size_t K) {
  312. auto dot_used = benchmarker_quint8_dot.exec({{M, K}, {K, N}, {}}) / RUNS;
  313. auto normal_used = benchmarker_quint8.exec({{M, K}, {K, N}, {}}) / RUNS;
  314. float computations = 2.f * M * K * N * 1e-6;
  315. printf("run: {%zu{M} %zu{K} %zu{N}} dot: %f ms %f Gflops \n"
  316. "normal: %f ms %f Gflops.speedup: %f\n",
  317. M, K, N, dot_used, computations / dot_used, normal_used,
  318. computations / normal_used, normal_used / dot_used);
  319. };
  320. run(256, 12 * 24, 256);
  321. //////////////////////// gemm //////////////////////////
  322. for (size_t M : {8, 64, 112, 256}) {
  323. for (size_t K : {8, 16, 32, 64, 112, 256}) {
  324. for (size_t N : {113, 114, 115, 256, 1024}) {
  325. run(M, N, K);
  326. }
  327. }
  328. }
  329. }
  330. #endif
  331. } // namespace
  332. #if MGB_ENABLE_DOT
  333. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x32_K6x8x4) {
  334. run_8x8x32_benchmark("AARCH32_INT8_K6X8X4", handle());
  335. }
  336. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_QUINT8x8x32_K4x8x4) {
  337. run_8x8x32_quint_benchmark(handle());
  338. }
  339. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x32_MK4_DOT) {
  340. constexpr size_t RUNS = 50;
  341. param::MatrixMul param;
  342. Benchmarker<MatrixMul> benchmarker_default(handle());
  343. benchmarker_default.set_times(RUNS)
  344. .set_dtype(0, dtype::Int8())
  345. .set_dtype(1, dtype::Int8())
  346. .set_dtype(2, dtype::Int32())
  347. .set_param(param)
  348. .set_display(false);
  349. benchmarker_default.set_before_exec_callback(
  350. AlgoChecker<MatrixMul>("AARCH32_INT8_K6X8X4"));
  351. param.format = MatrixMul::Param::Format::MK4_DOT;
  352. Benchmarker<MatrixMul> benchmarker_mk4_dot(handle());
  353. benchmarker_mk4_dot.set_before_exec_callback(
  354. AlgoChecker<MatrixMul>("AARCH32_INT8_MK4_8X4X4_DOTPROD"));
  355. benchmarker_mk4_dot.set_param(param)
  356. .set_dtype(0, dtype::Int8())
  357. .set_dtype(1, dtype::Int8())
  358. .set_dtype(2, dtype::Int32())
  359. .set_display(false)
  360. .set_times(RUNS);
  361. auto run = [&](size_t M, size_t N, size_t K) {
  362. auto default_used = benchmarker_default.exec({{M, K}, {K, N}, {}}) / RUNS;
  363. auto mk4_dot_used =
  364. benchmarker_mk4_dot.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
  365. RUNS;
  366. float computations = 2.f * M * K * N * 1e-6;
  367. printf("run: {%zu{M} %zu{K} %zu{N}} default: %f ms %f Gflops mk4_dot: "
  368. "%f ms "
  369. "%f Gflops speedup: %f\n",
  370. M, K, N, default_used, computations / default_used, mk4_dot_used,
  371. computations / mk4_dot_used, default_used / mk4_dot_used);
  372. };
  373. for (size_t M = 4; M < 512; M *= 2) {
  374. for (size_t K = 4; K < 512; K *= 2) {
  375. for (size_t N : {4, 8, 33, 113, 128}) {
  376. run(M, N, K);
  377. }
  378. }
  379. }
  380. }
  381. #endif
  382. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x2x16) {
  383. run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X2X16", handle());
  384. }
  385. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8) {
  386. run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X8X8", handle());
  387. }
  388. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K8x8x4) {
  389. run_8x8x16_benchmark("ARMV7_INT8X8X16_K8X8X4", handle());
  390. }
  391. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_MK4_K4x8x8) {
  392. run_8x8x16_benchmark(
  393. "ARMV7_INT8X8X16_MK4_K8X8X4", handle(), MatrixMul::Param::Format::MK4);
  394. }
  395. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_K12x4x1) {
  396. run_16x16x32_benchmark("ARMV7_INT16X16X32_K12X4X1", handle());
  397. }
  398. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K8x8x4_CONTRAST) {
  399. run_8x8x16_contrast("ARM_COMMON_INT8X8X16", "ARMV7_INT8X8X16_K8X8X4", handle());
  400. }
  401. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8_CONTRAST) {
  402. run_8x8x16_contrast("ARM_COMMON_INT8X8X16", "ARMV7_INT8X8X16_K4X8X8", handle());
  403. }
  404. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8_K8x8x4_CONTRAST) {
  405. run_8x8x16_contrast("ARMV7_INT8X8X16_K4X8X8", "ARMV7_INT8X8X16_K8X8X4", handle());
  406. }
  407. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  408. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_FP16) {
  409. constexpr size_t RUNS = 50;
  410. param::MatrixMul param;
  411. Benchmarker<MatrixMul> benchmarker_fp16(handle());
  412. benchmarker_fp16.set_times(RUNS)
  413. .set_dtype(0, dtype::Float16())
  414. .set_dtype(1, dtype::Float16())
  415. .set_dtype(2, dtype::Float16())
  416. .set_param(param)
  417. .set_display(false);
  418. Benchmarker<MatrixMul> benchmarker_float(handle());
  419. benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
  420. auto run = [&](size_t M, size_t N, size_t K) {
  421. auto fp16_used = benchmarker_fp16.exec({{M, K}, {K, N}, {}}) / RUNS;
  422. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  423. float computations = 2.f * M * K * N * 1e-6;
  424. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops fp16: %f ms "
  425. "%f Gflops speedup: %f\n",
  426. M, K, N, float_used, computations / float_used, fp16_used,
  427. computations / fp16_used, float_used / fp16_used);
  428. };
  429. run(256, 12 * 24, 256);
  430. for (size_t M : {8, 64, 112, 256}) {
  431. for (size_t K : {8, 64, 112, 256}) {
  432. for (size_t N : {8, 64, 112, 256}) {
  433. run(M, N, K);
  434. }
  435. }
  436. }
  437. }
  438. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_F16_MK8) {
  439. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
  440. matrix_mul::benchmark_with_contrast(
  441. handle(), args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
  442. "AARCH32_F16_MK8_4X8", param::MatrixMul::Format::MK8, dtype::Float16{},
  443. dtype::Float16{}, dtype::Float16{}, "AARCH32_F16_K4X16X1");
  444. }
  445. #endif
  446. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_MK4) {
  447. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  448. matrix_mul::benchmark_with_contrast(
  449. handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
  450. "ARMV7_F32_MK4_4x8", param::MatrixMul::Format::MK4, dtype::Float32{},
  451. dtype::Float32{}, dtype::Float32{});
  452. }
  453. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_PACK_MK4) {
  454. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  455. matrix_mul::benchmark_with_contrast(
  456. handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
  457. "ARMV7_F32_MK4_PACK_4X12", param::MatrixMul::Format::MK4, dtype::Float32{},
  458. dtype::Float32{}, dtype::Float32{});
  459. }
  460. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
  461. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
  462. matrix_mul::benchmark_with_contrast(
  463. handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  464. "ARMV7_INT16X16X32_MK8_4X8", param::MatrixMul::Format::MK8, dtype::Int16{},
  465. dtype::Int16{}, dtype::Int32{});
  466. }
  467. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT32_MK_4X2X16) {
  468. constexpr size_t RUNS = 50;
  469. param::MatrixMul param;
  470. param.transposeA = false;
  471. param.transposeB = false;
  472. Benchmarker<MatrixMul> benchmarker(handle());
  473. Benchmarker<MatrixMul> benchmarker_mk4(handle());
  474. benchmarker.set_times(RUNS)
  475. .set_dtype(0, dtype::Int8{})
  476. .set_dtype(1, dtype::Int8{})
  477. .set_dtype(2, dtype::Int32{})
  478. .set_param(param)
  479. .set_display(false);
  480. benchmarker.set_before_exec_callback(
  481. AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_K4X2X16"));
  482. param.format = MatrixMul::Param::Format::MK4;
  483. benchmarker_mk4.set_before_exec_callback(
  484. AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_MK4_4X2X16"));
  485. benchmarker_mk4.set_times(RUNS)
  486. .set_dtype(0, dtype::Int8{})
  487. .set_dtype(1, dtype::Int8{})
  488. .set_dtype(2, dtype::Int32{})
  489. .set_param(param)
  490. .set_display(false);
  491. auto run = [&](size_t M, size_t N, size_t K) {
  492. auto mk_used =
  493. benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
  494. auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
  495. float computations = 2.f * M * K * N * 1e-6;
  496. printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
  497. "%f Gflops speedup_vs_normal: %f\n",
  498. M, K, N, default_used, computations / default_used, mk_used,
  499. computations / mk_used, default_used / mk_used);
  500. };
  501. run(256, 256, 128);
  502. for (size_t k = 4; k <= 512; k *= 2) {
  503. for (size_t m = 4; m <= 512; m *= 2) {
  504. for (size_t n = 4; n <= 512; n *= 2) {
  505. run(m, n, k);
  506. }
  507. }
  508. std::cout << std::endl;
  509. }
  510. }
  511. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_ARMV7_F32) {
  512. auto args = matrix_mul::get_benchmark_matmul_args();
  513. matrix_mul::benchmark_single_algo(
  514. handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
  515. "ARMV7_F32", param::MatrixMul::Format::DEFAULT);
  516. }
  517. #endif
  518. // vim: syntax=cpp.doxygen