You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587
  1. /**
  2. * \file dnn/test/armv7/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "test/common/matrix_mul.h"
  13. #include "test/armv7/fixture.h"
  14. #include "test/common/benchmarker.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/rng.h"
  17. using namespace megdnn;
  18. using namespace test;
  19. TEST_F(ARMV7, MATRIX_MUL) {
  20. matrix_mul::check_matrix_mul(
  21. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  22. "ARMV7_F32");
  23. }
  24. TEST_F(ARMV7, MATRIX_MUL_MK4) {
  25. matrix_mul::check_matrix_mul(
  26. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  27. "ARMV7_F32_MK4_4x8", param::MatrixMul::Format::MK4, 1);
  28. }
  29. TEST_F(ARMV7, MATRIX_MUL_PACK_MK4) {
  30. matrix_mul::check_matrix_mul(
  31. dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
  32. "ARMV7_F32_MK4_PACK_4X12", param::MatrixMul::Format::MK4, 1);
  33. }
  34. TEST_F(ARMV7, MATRIX_MUL_MK4_INT8) {
  35. std::vector<matrix_mul::TestArg> args;
  36. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  37. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  38. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  39. args.emplace_back(m, n, k, 0);
  40. matrix_mul::check_matrix_mul(
  41. dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
  42. "ARMV7_INT8X8X32_MK4_4X2X16", param::MatrixMul::Format::MK4, 1, 1e-3,
  43. std::move(args));
  44. }
  45. TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_K4x8x8) {
  46. matrix_mul::check_matrix_mul(
  47. dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
  48. "ARMV7_INT8X8X16_K4X8X8");
  49. }
  50. TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_K8x8x4) {
  51. matrix_mul::check_matrix_mul(
  52. dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
  53. "ARMV7_INT8X8X16_K8X8X4");
  54. }
  55. TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_MK4_K8x8x4) {
  56. matrix_mul::check_matrix_mul(
  57. dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle(),
  58. "ARMV7_INT8X8X16_MK4_K8X8X4", param::MatrixMul::Format::MK4, 1);
  59. }
  60. TEST_F(ARMV7, MATRIX_MUL_INT16x16x32) {
  61. matrix_mul::check_matrix_mul(
  62. dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
  63. "ARMV7_INT16X16X32_K12X4X1");
  64. }
  65. TEST_F(ARMV7, MATRIX_MUL_INT16x16x32_MK8) {
  66. matrix_mul::check_matrix_mul(
  67. dtype::Int16{}, dtype::Int16{}, dtype::Int32{}, handle(),
  68. "ARMV7_INT16X16X32_MK8_4X8", param::MatrixMul::Format::MK8, 1);
  69. }
  70. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  71. TEST_F(ARMV7, MATRIX_MUL_FP16) {
  72. matrix_mul::check_matrix_mul(
  73. dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
  74. "AARCH32_F16_K4X16X1");
  75. }
  76. TEST_F(ARMV7, MATRIX_MUL_F16_MK8) {
  77. matrix_mul::check_matrix_mul(
  78. dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
  79. "AARCH32_F16_MK8_4X8", param::MatrixMul::Format::MK8, 1);
  80. }
  81. #endif
  82. #if MGB_ENABLE_DOT
  83. TEST_F(ARMV7, MATRIX_MUL_SDOT) {
  84. matrix_mul::check_matrix_mul(
  85. dtype::Int8(), dtype::Int8(), dtype::Int32(), handle(),
  86. "AARCH32_INT8_K6X8X4");
  87. }
  88. TEST_F(ARMV7, MATRIX_MUL_UDOT) {
  89. matrix_mul::check_matrix_mul(
  90. dtype::Quantized8Asymm(4.0f, static_cast<uint8_t>(10)),
  91. dtype::Quantized8Asymm(3.0f, static_cast<uint8_t>(54)),
  92. dtype::QuantizedS32(12.0f), handle(), "AARCH32_QUINT8_K4X8X4");
  93. }
  94. TEST_F(ARMV7, MATRIX_MUL_MK4_DOT_INT8) {
  95. std::vector<matrix_mul::TestArg> args;
  96. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  97. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  98. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  99. args.emplace_back(m, n, k, 0);
  100. matrix_mul::check_matrix_mul(
  101. dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle(),
  102. "AARCH32_INT8_MK4_8X4X4_DOTPROD", param::MatrixMul::Format::MK4_DOT, 1,
  103. 1e-3, std::move(args));
  104. }
  105. #endif
  106. #if MEGDNN_WITH_BENCHMARK
  107. namespace {
// Benchmarks the int8x8x16 matmul kernel named by `algo` against two
// baselines: the ARM_COMMON_INT8X8X16 kernel and the handle's default float
// kernel. Prints per-shape timings and effective Gflops.
//   algo   - candidate MatrixMul algorithm name (passed to AlgoChecker)
//   handle - device handle the benchmarkers run on
//   format - operand layout for the candidate kernel; MK4 switches the input
//            tensor shapes to the blocked (M/4, K/4, 4, 4) layout
void run_8x8x16_benchmark(
        const char* algo, Handle* handle,
        MatrixMul::Param::Format format = MatrixMul::Param::Format::DEFAULT) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle);
    Benchmarker<MatrixMul> benchmarker_int_kern_4x2x16(handle);
    // Baseline: pin the generic arm_common int8x8x16 implementation.
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X16"));
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    // Candidate: same dtypes, but with the requested layout format.
    param::MatrixMul target_param;
    target_param.format = format;
    benchmarker_int_kern_4x2x16.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
    benchmarker_int_kern_4x2x16.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(target_param)
            .set_display(false);
    // Float baseline uses whatever algorithm the handle picks by default.
    Benchmarker<MatrixMul> benchmarker_float(handle);
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto int_kern_used = 1e10;
        if (format == MatrixMul::Param::Format::MK4) {
            // MK4 layout: A is (M/4, K/4, 4, 4), B is (K/4, N, 4).
            int_kern_used = benchmarker_int_kern_4x2x16.exec(
                                    {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                            RUNS;
        } else {
            int_kern_used =
                    benchmarker_int_kern_4x2x16.exec({{M, K}, {K, N}, {}}) / RUNS;
        }
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N flops per matmul; 1e-6 scales ms-based timings to Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f "
               "ms "
               "%f Gflops %s: %f ms %f Gflops "
               "speedup(%s/arm_common, %s/float): %f "
               "%f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, algo, int_kern_used,
               computations / int_kern_used, algo, algo, int_used / int_kern_used,
               float_used / int_kern_used);
    };
    run(256, 12 * 24, 256);
    run(256, 256, 256);
    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
// Prints a side-by-side table contrasting two int8x8x16 matmul algorithms.
//   algo0  - baseline algorithm name
//   algo   - candidate algorithm name
//   handle - device handle the benchmarkers run on
//   format - layout of the candidate's operands (MK4 switches input shapes to
//            the blocked layout; the baseline always uses the plain layout)
// Each row shows "time(GFlops)" for both algorithms plus baseline/candidate
// speedup. The sweep fixes M=8, K=72 and varies N over powers of two.
void run_8x8x16_contrast(
        const char* algo0, const char* algo, Handle* handle,
        MatrixMul::Param::Format format = MatrixMul::Param::Format::DEFAULT) {
    constexpr size_t RUNS = 100;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle);
    Benchmarker<MatrixMul> benchmarker_int_kern_4x2x16(handle);
    benchmarker_int.set_before_exec_callback(AlgoChecker<MatrixMul>(algo0));
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    param::MatrixMul target_param;
    target_param.format = format;
    benchmarker_int_kern_4x2x16.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
    benchmarker_int_kern_4x2x16.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(target_param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto int_kern_used = 1e10;
        // 2*M*N*K flops; 1e-6 converts the ms-based timing to Gflops.
        double computation = 2.0f * M * N * K * 1e-6;
        if (format == MatrixMul::Param::Format::MK4) {
            int_kern_used = benchmarker_int_kern_4x2x16.exec(
                                    {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                            RUNS;
        } else {
            int_kern_used =
                    benchmarker_int_kern_4x2x16.exec({{M, K}, {K, N}, {}}) / RUNS;
        }
        printf(" %f(%f)\t %f(%f)\t %f\n", int_used, computation / int_used,
               int_kern_used, computation / int_kern_used, int_used / int_kern_used);
    };
    printf("\nN\t K\t M\t %s ms(GFlops)\t %s ms(GFlops)\t SPEEDUP\n", algo0, algo);
    for (size_t M : {8}) {
        for (size_t K : {72}) {
            for (size_t N :
                 {8, 16, 32, 64, 72, 128, 256, 512, 1024, 4096, 8192, 16384, 32768,
                  65536}) {
                printf("%zu\t %zu\t %zu\t", N, K, M);
                run(M, N, K);
            }
        }
    }
    // One large square case outside the sweep for reference.
    printf("512\t 512\t 512\t");
    run(512, 512, 512);
}
// Benchmarks the int16x16x32 K12X4X1 kernel against the handle's default
// float kernel and prints per-shape timings/Gflops.
// NOTE(review): the pinned algorithm is always "ARMV7_INT16X16X32_K12X4X1";
// `algo` is only used in the printed labels — confirm that is intended.
void run_16x16x32_benchmark(const char* algo, Handle* handle) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle);
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_INT16X16X32_K12X4X1"));
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int16{})
            .set_dtype(1, dtype::Int16{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    // Float baseline uses the handle's default algorithm choice.
    Benchmarker<MatrixMul> benchmarker_float(handle);
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N flops; 1e-6 converts ms-based timings to Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
               "int: %f ms %f Gflops %s: \n"
               "speedup(%s/arm_common, %s/float): %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, algo, algo, algo, float_used / int_used);
    };
    run(256, 12 * 24, 256);
    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N :
                 {1, 2, 3, 4, 8, 64, 112, 113, 114, 115, 256, 257, 258, 259}) {
                run(M, N, K);
            }
        }
    }
}
  267. #if MGB_ENABLE_DOT
// Benchmarks the int8x8x32 kernel named by `algo` against the handle's
// default float kernel and prints per-shape timings/Gflops.
void run_8x8x32_benchmark(const char* algo, Handle* handle) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int8(handle);
    benchmarker_int8.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
    benchmarker_int8.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    // Float baseline uses the handle's default algorithm choice.
    Benchmarker<MatrixMul> benchmarker_float(handle);
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int8.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N flops; 1e-6 converts ms-based timings to Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
               "int: %f ms %f Gflops %s: \n"
               "speedup(%s/arm_common, %s/float): %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, algo, algo, algo, float_used / int_used);
    };
    run(256, 12 * 24, 256);
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {113, 114, 115, 256, 1024}) {
                run(M, N, K);
            }
        }
    }
}
// Contrasts the dot-product quint8 kernel (AARCH32_QUINT8_K4X8X4) against the
// non-dot kernel (ARMV7_QUINT8_K4X8X8) on identical quantized dtypes, printing
// per-shape timings/Gflops and the normal/dot speedup.
void run_8x8x32_quint_benchmark(Handle* handle) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_quint8_dot(handle);
    benchmarker_quint8_dot.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH32_QUINT8_K4X8X4"));
    benchmarker_quint8_dot.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
            .set_dtype(1, dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
            .set_dtype(2, dtype::QuantizedS32(2.3f * 3.1f))
            .set_param(param)
            .set_display(false);
    // Non-dot reference kernel, configured with the same quantization params.
    Benchmarker<MatrixMul> benchmarker_quint8(handle);
    benchmarker_quint8.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_QUINT8_K4X8X8"));
    benchmarker_quint8.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
            .set_dtype(1, dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
            .set_dtype(2, dtype::QuantizedS32(2.3f * 3.1f))
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto dot_used = benchmarker_quint8_dot.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto normal_used = benchmarker_quint8.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N flops; 1e-6 converts ms-based timings to Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} dot: %f ms %f Gflops \n"
               "normal: %f ms %f Gflops.speedup: %f\n",
               M, K, N, dot_used, computations / dot_used, normal_used,
               computations / normal_used, normal_used / dot_used);
    };
    run(256, 12 * 24, 256);
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {113, 114, 115, 256, 1024}) {
                run(M, N, K);
            }
        }
    }
}
  341. #endif
  342. } // namespace
  343. #if MGB_ENABLE_DOT
  344. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x32_K6x8x4) {
  345. run_8x8x32_benchmark("AARCH32_INT8_K6X8X4", handle());
  346. }
  347. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_QUINT8x8x32_K4x8x4) {
  348. run_8x8x32_quint_benchmark(handle());
  349. }
// Contrasts the MK4_DOT-layout int8x8x32 dot-product kernel against the
// default-layout K6X8X4 dot-product kernel over a power-of-two M/K sweep.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x32_MK4_DOT) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    // Baseline: default layout, K6X8X4 dot kernel.
    Benchmarker<MatrixMul> benchmarker_default(handle());
    benchmarker_default.set_times(RUNS)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_param(param)
            .set_display(false);
    benchmarker_default.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH32_INT8_K6X8X4"));
    // Candidate: same dtypes with the MK4_DOT blocked layout.
    param.format = MatrixMul::Param::Format::MK4_DOT;
    Benchmarker<MatrixMul> benchmarker_mk4_dot(handle());
    benchmarker_mk4_dot.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH32_INT8_MK4_8X4X4_DOTPROD"));
    benchmarker_mk4_dot.set_param(param)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_display(false)
            .set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used = benchmarker_default.exec({{M, K}, {K, N}, {}}) / RUNS;
        // MK4_DOT layout: A is (M/4, K/4, 4, 4), B is (K/4, N, 4).
        auto mk4_dot_used =
                benchmarker_mk4_dot.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                RUNS;
        // 2*M*K*N flops; 1e-6 converts ms-based timings to Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} default: %f ms %f Gflops mk4_dot: "
               "%f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, default_used, computations / default_used, mk4_dot_used,
               computations / mk4_dot_used, default_used / mk4_dot_used);
    };
    for (size_t M = 4; M < 512; M *= 2) {
        for (size_t K = 4; K < 512; K *= 2) {
            for (size_t N : {4, 8, 33, 113, 128}) {
                run(M, N, K);
            }
        }
    }
}
  392. #endif
  393. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x2x16) {
  394. run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X2X16", handle());
  395. }
  396. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8) {
  397. run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X8X8", handle());
  398. }
  399. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K8x8x4) {
  400. run_8x8x16_benchmark("ARMV7_INT8X8X16_K8X8X4", handle());
  401. }
// Benchmark the MK4-layout int8x8x16 kernel against the baselines.
// NOTE(review): the test name says MK4_K4x8x8 but the algorithm benchmarked
// is ARMV7_INT8X8X16_MK4_K8X8X4 (matching the MK4_K8x8x4 correctness test
// above) — the test name looks stale; confirm which was intended.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_MK4_K4x8x8) {
    run_8x8x16_benchmark(
            "ARMV7_INT8X8X16_MK4_K8X8X4", handle(), MatrixMul::Param::Format::MK4);
}
  406. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_K12x4x1) {
  407. run_16x16x32_benchmark("ARMV7_INT16X16X32_K12X4X1", handle());
  408. }
  409. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K8x8x4_CONTRAST) {
  410. run_8x8x16_contrast("ARM_COMMON_INT8X8X16", "ARMV7_INT8X8X16_K8X8X4", handle());
  411. }
  412. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8_CONTRAST) {
  413. run_8x8x16_contrast("ARM_COMMON_INT8X8X16", "ARMV7_INT8X8X16_K4X8X8", handle());
  414. }
  415. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8_K8x8x4_CONTRAST) {
  416. run_8x8x16_contrast("ARMV7_INT8X8X16_K4X8X8", "ARMV7_INT8X8X16_K8X8X4", handle());
  417. }
  418. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Benchmarks fp16 matmul (handle's default fp16 algorithm) against fp32,
// printing per-shape timings/Gflops and the float/fp16 speedup.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_FP16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_fp16(handle());
    benchmarker_fp16.set_times(RUNS)
            .set_dtype(0, dtype::Float16())
            .set_dtype(1, dtype::Float16())
            .set_dtype(2, dtype::Float16())
            .set_param(param)
            .set_display(false);
    // fp32 baseline; dtypes left at their defaults.
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto fp16_used = benchmarker_fp16.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N flops; 1e-6 converts ms-based timings to Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops fp16: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, fp16_used,
               computations / fp16_used, float_used / fp16_used);
    };
    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
  449. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_F16_MK8) {
  450. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
  451. matrix_mul::benchmark_with_contrast(
  452. handle(), args, dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
  453. "AARCH32_F16_MK8_4X8", param::MatrixMul::Format::MK8, dtype::Float16{},
  454. dtype::Float16{}, dtype::Float16{}, "AARCH32_F16_K4X16X1");
  455. }
  456. #endif
  457. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_MK4) {
  458. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  459. matrix_mul::benchmark_with_contrast(
  460. handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
  461. "ARMV7_F32_MK4_4x8", param::MatrixMul::Format::MK4, dtype::Float32{},
  462. dtype::Float32{}, dtype::Float32{});
  463. }
  464. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_PACK_MK4) {
  465. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
  466. matrix_mul::benchmark_with_contrast(
  467. handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
  468. "ARMV7_F32_MK4_PACK_4X12", param::MatrixMul::Format::MK4, dtype::Float32{},
  469. dtype::Float32{}, dtype::Float32{});
  470. }
  471. TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
  472. auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
  473. matrix_mul::benchmark_with_contrast(
  474. handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
  475. "ARMV7_INT16X16X32_MK8_4X8", param::MatrixMul::Format::MK8, dtype::Int16{},
  476. dtype::Int16{}, dtype::Int32{});
  477. }
// Contrasts the MK4-layout int8x8x32 4x2x16 kernel against the default-layout
// K4X2X16 kernel over a power-of-two m/n/k sweep, printing per-shape speedups.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT32_MK_4X2X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    // Baseline: default-layout K4X2X16 kernel.
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_K4X2X16"));
    // Candidate: same kernel family but with MK4-packed operands.
    param.format = MatrixMul::Param::Format::MK4;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_MK4_4X2X16"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        // MK4 layout: A is (M/4, K/4, 4, 4), B is (K/4, N, 4).
        auto mk_used =
                benchmarker_mk4.exec({{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) / RUNS;
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N flops; 1e-6 converts ms-based timings to Gflops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup_vs_normal: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 512; k *= 2) {
        for (size_t m = 4; m <= 512; m *= 2) {
            for (size_t n = 4; n <= 512; n *= 2) {
                run(m, n, k);
            }
        }
        // Blank line between k-groups to keep the output readable.
        std::cout << std::endl;
    }
}
  522. #endif
  523. // vim: syntax=cpp.doxygen