You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603
  1. /**
  2. * \file dnn/test/armv7/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "test/armv7/fixture.h"
  13. #include "test/common/benchmarker.h"
  14. #include "test/common/checker.h"
  15. #include "test/common/matrix_mul.h"
  16. #include "test/common/rng.h"
  17. using namespace megdnn;
  18. using namespace test;
// Correctness check: default-layout fp32 matmul via the "ARMV7_F32" algorithm.
TEST_F(ARMV7, MATRIX_MUL) {
    matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
                                 dtype::Float32{}, handle(), "ARMV7_F32");
}
// Correctness check: fp32 matmul with MK4 blocked operand layout.
TEST_F(ARMV7, MATRIX_MUL_MK4) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "ARMV7_F32_MK4_4x8", param::MatrixMul::Format::MK4, 1);
}
// Correctness check: fp32 MK4 matmul via the packed 4x12 kernel.
TEST_F(ARMV7, MATRIX_MUL_PACK_MK4) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "ARMV7_F32_MK4_PACK_4X12", param::MatrixMul::Format::MK4, 1);
}
  33. TEST_F(ARMV7, MATRIX_MUL_MK4_INT8) {
  34. std::vector<matrix_mul::TestArg> args;
  35. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  36. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  37. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  38. args.emplace_back(m, n, k, 0);
  39. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  40. handle(), "ARMV7_INT8X8X32_MK4_4X2X16",
  41. param::MatrixMul::Format::MK4, 1, 1e-3,
  42. std::move(args));
  43. }
// Correctness check: int8x8x16 matmul via the K4x8x8 kernel.
TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_K4x8x8) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
                                 handle(), "ARMV7_INT8X8X16_K4X8X8");
}
// Correctness check: int8x8x16 matmul via the K8x8x4 kernel.
TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_K8x8x4) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
                                 handle(), "ARMV7_INT8X8X16_K8X8X4");
}
// Correctness check: int8x8x16 matmul, MK4 blocked layout, K8x8x4 kernel.
TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_MK4_K8x8x4) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
                                 handle(), "ARMV7_INT8X8X16_MK4_K8X8X4",
                                 param::MatrixMul::Format::MK4, 1);
}
// Correctness check: int16x16x32 matmul via the K12x4x1 kernel.
TEST_F(ARMV7, MATRIX_MUL_INT16x16x32) {
    matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
                                 handle(), "ARMV7_INT16X16X32_K12X4X1");
}
// Correctness check: int16x16x32 matmul with MK8 blocked layout.
TEST_F(ARMV7, MATRIX_MUL_INT16x16x32_MK8) {
    matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
                                 handle(), "ARMV7_INT16X16X32_MK8_4X8",
                                 param::MatrixMul::Format::MK8, 1);
}
  66. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Correctness check: fp16 matmul (requires FP16 vector arithmetic support).
TEST_F(ARMV7, MATRIX_MUL_FP16) {
    matrix_mul::check_matrix_mul(dtype::Float16{}, dtype::Float16{},
                                 dtype::Float16{}, handle(),
                                 "AARCH32_F16_K4X16X1");
}
// Correctness check: fp16 matmul with MK8 blocked layout.
TEST_F(ARMV7, MATRIX_MUL_F16_MK8) {
    matrix_mul::check_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
            "AARCH32_F16_MK8_4X8", param::MatrixMul::Format::MK8, 1);
}
  77. #endif
  78. #if MGB_ENABLE_DOT
// Correctness check: int8x8x32 matmul via the signed dot-product kernel
// (requires MGB_ENABLE_DOT).
TEST_F(ARMV7, MATRIX_MUL_SDOT) {
    matrix_mul::check_matrix_mul(dtype::Int8(), dtype::Int8(), dtype::Int32(),
                                 handle(), "AARCH32_INT8_K6X8X4");
}
// Correctness check: quantized-asymmetric uint8 matmul via the unsigned
// dot-product kernel; scales/zero-points are arbitrary non-trivial values.
TEST_F(ARMV7, MATRIX_MUL_UDOT) {
    matrix_mul::check_matrix_mul(
            dtype::Quantized8Asymm(4.0f, static_cast<uint8_t>(10)),
            dtype::Quantized8Asymm(3.0f, static_cast<uint8_t>(54)),
            dtype::QuantizedS32(12.0f), handle(), "AARCH32_QUINT8_K4X8X4");
}
  89. TEST_F(ARMV7, MATRIX_MUL_MK4_DOT_INT8) {
  90. std::vector<matrix_mul::TestArg> args;
  91. for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
  92. for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
  93. for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
  94. args.emplace_back(m, n, k, 0);
  95. matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
  96. handle(), "AARCH32_INT8_MK4_8X4X4_DOTPROD",
  97. param::MatrixMul::Format::MK4_DOT, 1, 1e-3,
  98. std::move(args));
  99. }
  100. #endif
  101. #if MEGDNN_WITH_BENCHMARK
  102. namespace {
// Benchmark one int8x8x16 algorithm (`algo`) against the generic
// ARM_COMMON_INT8X8X16 kernel and the default float path, printing ms and
// GFlops for each shape. `format` selects the operand layout used by `algo`
// (DEFAULT plain row-major, or MK4 blocked).
void run_8x8x16_benchmark(
        const char* algo, Handle* handle,
        MatrixMul::Param::Format format = MatrixMul::Param::Format::DEFAULT) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle);
    Benchmarker<MatrixMul> benchmarker_int_kern_4x2x16(handle);
    // Baseline: the generic arm_common int8x8x16 kernel with default layout.
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X16"));
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    param::MatrixMul target_param;
    target_param.format = format;
    // Candidate: the algorithm under test, with the requested layout.
    benchmarker_int_kern_4x2x16.set_before_exec_callback(
            AlgoChecker<MatrixMul>(algo));
    benchmarker_int_kern_4x2x16.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(target_param)
            .set_display(false);
    // Float reference: no AlgoChecker, so the heuristic picks the algorithm.
    Benchmarker<MatrixMul> benchmarker_float(handle);
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto int_kern_used = 1e10;
        if (format == MatrixMul::Param::Format::MK4) {
            // MK4 layout: A is (M/4, K/4, 4, 4), B is (K/4, N, 4).
            int_kern_used = benchmarker_int_kern_4x2x16.exec(
                                    {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                            RUNS;
        } else {
            int_kern_used =
                    benchmarker_int_kern_4x2x16.exec({{M, K}, {K, N}, {}}) /
                    RUNS;
        }
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N multiply-adds; with times in ms this yields GFlops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f "
               "ms "
               "%f Gflops %s: %f ms %f Gflops "
               "speedup(%s/arm_common, %s/float): %f "
               "%f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, algo, int_kern_used,
               computations / int_kern_used, algo, algo,
               int_used / int_kern_used, float_used / int_kern_used);
    };
    run(256, 12 * 24, 256);
    run(256, 256, 256);
    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
// Print a side-by-side comparison table (time, GFlops, speedup) of two
// int8x8x16 algorithms: `algo0` (default layout baseline) vs `algo`, whose
// operand layout is selected by `format`.
void run_8x8x16_contrast(
        const char* algo0, const char* algo, Handle* handle,
        MatrixMul::Param::Format format = MatrixMul::Param::Format::DEFAULT) {
    constexpr size_t RUNS = 100;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle);
    Benchmarker<MatrixMul> benchmarker_int_kern_4x2x16(handle);
    benchmarker_int.set_before_exec_callback(AlgoChecker<MatrixMul>(algo0));
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    param::MatrixMul target_param;
    target_param.format = format;
    benchmarker_int_kern_4x2x16.set_before_exec_callback(
            AlgoChecker<MatrixMul>(algo));
    benchmarker_int_kern_4x2x16.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(target_param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto int_kern_used = 1e10;
        // 2*M*N*K multiply-adds; with times in ms this yields GFlops.
        double computation = 2.0f * M * N * K * 1e-6;
        if (format == MatrixMul::Param::Format::MK4) {
            // MK4 layout: A is (M/4, K/4, 4, 4), B is (K/4, N, 4).
            int_kern_used = benchmarker_int_kern_4x2x16.exec(
                                    {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                            RUNS;
        } else {
            int_kern_used =
                    benchmarker_int_kern_4x2x16.exec({{M, K}, {K, N}, {}}) /
                    RUNS;
        }
        printf(" %f(%f)\t %f(%f)\t %f\n", int_used, computation / int_used,
               int_kern_used, computation / int_kern_used,
               int_used / int_kern_used);
    };
    printf("\nN\t K\t M\t %s ms(GFlops)\t %s ms(GFlops)\t SPEEDUP\n", algo0,
           algo);
    // Sweep N over a wide range with small fixed M/K, then one large square.
    for (size_t M : {8}) {
        for (size_t K : {72}) {
            for (size_t N : {8, 16, 32, 64, 72, 128, 256, 512, 1024, 4096, 8192,
                             16384, 32768, 65536}) {
                printf("%zu\t %zu\t %zu\t", N, K, M);
                run(M, N, K);
            }
        }
    }
    printf("512\t 512\t 512\t");
    run(512, 512, 512);
}
  226. void run_16x16x32_benchmark(const char* algo, Handle* handle) {
  227. constexpr size_t RUNS = 50;
  228. param::MatrixMul param;
  229. Benchmarker<MatrixMul> benchmarker_int(handle);
  230. benchmarker_int.set_before_exec_callback(
  231. AlgoChecker<MatrixMul>("ARMV7_INT16X16X32_K12X4X1"));
  232. benchmarker_int.set_times(RUNS)
  233. .set_dtype(0, dtype::Int16{})
  234. .set_dtype(1, dtype::Int16{})
  235. .set_dtype(2, dtype::Int32{})
  236. .set_param(param)
  237. .set_display(false);
  238. Benchmarker<MatrixMul> benchmarker_float(handle);
  239. benchmarker_float.set_display(false).set_times(RUNS);
  240. auto run = [&](size_t M, size_t N, size_t K) {
  241. auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
  242. auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
  243. float computations = 2.f * M * K * N * 1e-6;
  244. printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
  245. "int: %f ms %f Gflops %s: \n"
  246. "speedup(%s/arm_common, %s/float): %f\n",
  247. M, K, N, float_used, computations / float_used, int_used,
  248. computations / int_used, algo, algo, algo,
  249. float_used / int_used);
  250. };
  251. run(256, 12 * 24, 256);
  252. //////////////////////// gemv //////////////////////////
  253. for (size_t M : {8, 64, 112, 256}) {
  254. for (size_t K : {8, 64, 112, 256}) {
  255. run(M, 1, K);
  256. }
  257. }
  258. //////////////////////// gemm //////////////////////////
  259. for (size_t M : {8, 64, 112, 256}) {
  260. for (size_t K : {8, 16, 32, 64, 112, 256}) {
  261. for (size_t N :
  262. {1, 2, 3, 4, 8, 64, 112, 113, 114, 115, 256, 257, 258, 259}) {
  263. run(M, N, K);
  264. }
  265. }
  266. }
  267. }
  268. #if MGB_ENABLE_DOT
// Benchmark an int8x8x32 algorithm (`algo`) against the default float path,
// printing ms and GFlops per shape.
void run_8x8x32_benchmark(const char* algo, Handle* handle) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int8(handle);
    benchmarker_int8.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
    benchmarker_int8.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    // Float reference: no AlgoChecker, so the heuristic picks the algorithm.
    Benchmarker<MatrixMul> benchmarker_float(handle);
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int8.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N multiply-adds; with times in ms this yields GFlops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
               "int: %f ms %f Gflops %s: \n"
               "speedup(%s/arm_common, %s/float): %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, algo, algo, algo,
               float_used / int_used);
    };
    run(256, 12 * 24, 256);
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {113, 114, 115, 256, 1024}) {
                run(M, N, K);
            }
        }
    }
}
// Compare the dot-product quint8 kernel (AARCH32_QUINT8_K4X8X4) against the
// non-dot ARMV7_QUINT8_K4X8X8 kernel on quantized-asymmetric inputs.
void run_8x8x32_quint_benchmark(Handle* handle) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_quint8_dot(handle);
    benchmarker_quint8_dot.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH32_QUINT8_K4X8X4"));
    // Output scale is the product of the two input scales.
    benchmarker_quint8_dot.set_times(RUNS)
            .set_dtype(0,
                       dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
            .set_dtype(2, dtype::QuantizedS32(2.3f * 3.1f))
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_quint8(handle);
    benchmarker_quint8.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_QUINT8_K4X8X8"));
    benchmarker_quint8.set_times(RUNS)
            .set_dtype(0,
                       dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
            .set_dtype(1,
                       dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
            .set_dtype(2, dtype::QuantizedS32(2.3f * 3.1f))
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto dot_used =
                benchmarker_quint8_dot.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto normal_used = benchmarker_quint8.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N multiply-adds; with times in ms this yields GFlops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} dot: %f ms %f Gflops \n"
               "normal: %f ms %f Gflops.speedup: %f\n",
               M, K, N, dot_used, computations / dot_used, normal_used,
               computations / normal_used, normal_used / dot_used);
    };
    run(256, 12 * 24, 256);
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {113, 114, 115, 256, 1024}) {
                run(M, N, K);
            }
        }
    }
}
  348. #endif
  349. } // namespace
  350. #if MGB_ENABLE_DOT
// Benchmark: int8x8x32 dot-product K6x8x4 kernel vs float.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x32_K6x8x4) {
    run_8x8x32_benchmark("AARCH32_INT8_K6X8X4", handle());
}
// Benchmark: quantized-asymmetric uint8 dot kernel vs the non-dot kernel.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_QUINT8x8x32_K4x8x4) {
    run_8x8x32_quint_benchmark(handle());
}
// Benchmark: MK4_DOT-layout int8 dot-product kernel vs the default-layout
// dot kernel (AARCH32_INT8_K6X8X4), over power-of-two M/K sweeps.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x32_MK4_DOT) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_default(handle());
    benchmarker_default.set_times(RUNS)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_param(param)
            .set_display(false);
    benchmarker_default.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH32_INT8_K6X8X4"));
    // Reuse `param` with the MK4_DOT format for the second benchmarker; the
    // default benchmarker already captured the DEFAULT-format param above.
    param.format = MatrixMul::Param::Format::MK4_DOT;
    Benchmarker<MatrixMul> benchmarker_mk4_dot(handle());
    benchmarker_mk4_dot.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH32_INT8_MK4_8X4X4_DOTPROD"));
    benchmarker_mk4_dot.set_param(param)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_display(false)
            .set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto default_used =
                benchmarker_default.exec({{M, K}, {K, N}, {}}) / RUNS;
        // MK4_DOT layout: A is (M/4, K/4, 4, 4), B is (K/4, N, 4).
        auto mk4_dot_used = benchmarker_mk4_dot.exec(
                                    {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                            RUNS;
        // 2*M*K*N multiply-adds; with times in ms this yields GFlops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} default: %f ms %f Gflops mk4_dot: "
               "%f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, default_used, computations / default_used, mk4_dot_used,
               computations / mk4_dot_used, default_used / mk4_dot_used);
    };
    for (size_t M = 4; M < 512; M *= 2) {
        for (size_t K = 4; K < 512; K *= 2) {
            for (size_t N : {4, 8, 33, 113, 128}) {
                run(M, N, K);
            }
        }
    }
}
  400. #endif
// Benchmark: int8x8x16 K4x2x16 kernel vs arm_common baseline and float.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x2x16) {
    run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X2X16", handle());
}
// Benchmark: int8x8x16 K4x8x8 kernel vs arm_common baseline and float.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8) {
    run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X8X8", handle());
}
// Benchmark: int8x8x16 K8x8x4 kernel vs arm_common baseline and float.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K8x8x4) {
    run_8x8x16_benchmark("ARMV7_INT8X8X16_K8X8X4", handle());
}
// Benchmark: int8x8x16 MK4-layout kernel vs arm_common baseline and float.
// NOTE(review): the test name says K4x8x8 but the algorithm benchmarked is
// ARMV7_INT8X8X16_MK4_K8X8X4 — confirm which was intended.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_MK4_K4x8x8) {
    run_8x8x16_benchmark("ARMV7_INT8X8X16_MK4_K8X8X4", handle(),
                         MatrixMul::Param::Format::MK4);
}
// Benchmark: int16x16x32 K12x4x1 kernel vs float.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_K12x4x1) {
    run_16x16x32_benchmark("ARMV7_INT16X16X32_K12X4X1", handle());
}
// Contrast table: arm_common int8x8x16 baseline vs the K8x8x4 kernel.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K8x8x4_CONTRAST) {
    run_8x8x16_contrast("ARM_COMMON_INT8X8X16", "ARMV7_INT8X8X16_K8X8X4",
                        handle());
}
// Contrast table: arm_common int8x8x16 baseline vs the K4x8x8 kernel.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8_CONTRAST) {
    run_8x8x16_contrast("ARM_COMMON_INT8X8X16", "ARMV7_INT8X8X16_K4X8X8",
                        handle());
}
// Contrast table: K4x8x8 vs K8x8x4 int8x8x16 kernels head-to-head.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8_K8x8x4_CONTRAST) {
    run_8x8x16_contrast("ARMV7_INT8X8X16_K4X8X8", "ARMV7_INT8X8X16_K8X8X4",
                        handle());
}
  429. #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Benchmark: fp16 matmul vs fp32 (both picked by the heuristic — no
// AlgoChecker is installed on either benchmarker).
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_FP16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_fp16(handle());
    benchmarker_fp16.set_times(RUNS)
            .set_dtype(0, dtype::Float16())
            .set_dtype(1, dtype::Float16())
            .set_dtype(2, dtype::Float16())
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto fp16_used = benchmarker_fp16.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N multiply-adds; with times in ms this yields GFlops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops fp16: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, fp16_used,
               computations / fp16_used, float_used / fp16_used);
    };
    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
// Contrast: fp16 MK8-layout kernel (AARCH32_F16_MK8_4X8) vs the
// default-layout fp16 kernel (AARCH32_F16_K4X16X1).
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_F16_MK8) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float16{}, dtype::Float16{},
            dtype::Float16{}, "AARCH32_F16_MK8_4X8",
            param::MatrixMul::Format::MK8, dtype::Float16{}, dtype::Float16{},
            dtype::Float16{}, "AARCH32_F16_K4X16X1");
}
  468. #endif
// Contrast: fp32 MK4-layout kernel vs the default fp32 path (no explicit
// contrast algo name — the heuristic chooses).
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_MK4) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{}, "ARMV7_F32_MK4_4x8",
            param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{});
}
// Contrast: fp32 MK4 packed 4x12 kernel vs the default fp32 path.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_PACK_MK4) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{}, "ARMV7_F32_MK4_PACK_4X12",
            param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{});
}
// Contrast: int16x16x32 MK8-layout kernel vs the default int16x16x32 path.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
            "ARMV7_INT16X16X32_MK8_4X8", param::MatrixMul::Format::MK8,
            dtype::Int16{}, dtype::Int16{}, dtype::Int32{});
}
// Benchmark: int8x8x32 default-layout K4x2x16 kernel vs the MK4-layout
// 4x2x16 kernel, over power-of-two shape sweeps.
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT32_MK_4X2X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_K4X2X16"));
    // Reuse `param` with MK4 format for the second benchmarker; the first
    // benchmarker already captured the DEFAULT-format param above.
    param.format = MatrixMul::Param::Format::MK4;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_MK4_4X2X16"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        // MK4 layout: A is (M/4, K/4, 4, 4), B is (K/4, N, 4).
        auto mk_used = benchmarker_mk4.exec(
                               {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                       RUNS;
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        // 2*M*K*N multiply-adds; with times in ms this yields GFlops.
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup_vs_normal: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 512; k *= 2) {
        for (size_t m = 4; m <= 512; m *= 2) {
            for (size_t n = 4; n <= 512; n *= 2) {
                run(m, n, k);
            }
        }
        // Blank line between K groups for readability of the output.
        std::cout << std::endl;
    }
}
  537. #endif
  538. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台