
matrix_mul.cpp 16 kB

/**
 * \file dnn/test/armv7/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
  11. #include "test/armv7/fixture.h"
  12. #include "test/common/benchmarker.h"
  13. #include "test/common/checker.h"
  14. #include "test/common/matrix_mul.h"
  15. #include "test/common/rng.h"
  16. using namespace megdnn;
  17. using namespace test;
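
// Correctness tests: each case pins a single algorithm by name via
// check_matrix_mul, which runs that kernel over a sweep of shapes and verifies
// its output against a reference implementation.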
TEST_F(ARMV7, MATRIX_MUL) {
    matrix_mul::check_matrix_mul(dtype::Float32{}, dtype::Float32{},
                                 dtype::Float32{}, handle(), "ARMV7_F32");
}
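
// MK4/MK8 are blocked layouts: operands are packed in blocks of 4 (or 8) along
// the M and K dimensions, e.g. for MK4 the A operand becomes (M/4, K/4, 4, 4)
// and B becomes (K/4, N, 4), as the MK4 benchmark at the end of this file
// constructs explicitly.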
TEST_F(ARMV7, MATRIX_MUL_MK4) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "ARMV7_F32_MK4_4x8", param::MatrixMul::Format::MK4, 4);
}

TEST_F(ARMV7, MATRIX_MUL_PACK_MK4) {
    matrix_mul::check_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
            "ARMV7_F32_MK4_PACK_4X12", param::MatrixMul::Format::MK4, 1);
}
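
// The int8 MK4 kernel gets an explicit shape list so that sizes which are not
// multiples of the kernel's inner block are also covered.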
TEST_F(ARMV7, MATRIX_MUL_MK4_INT8) {
    std::vector<matrix_mul::TestArg> args;
    for (size_t m : {1, 2, 3, 4, 5, 7, 10, 11})
        for (size_t n : {1, 2, 3, 4, 5, 8, 16, 24, 25, 32})
            for (size_t k : {1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 33, 34})
                args.emplace_back(m, n, k, 0);
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
                                 handle(), "ARMV7_INT8X8X32_MK4_4X2X16",
                                 param::MatrixMul::Format::MK4, 1, 1e-3,
                                 std::move(args));
}
TEST_F(ARMV7, MATRIX_MUL_INT8x8x16_K4x8x8) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
                                 handle(), "ARMV7_INT8X8X16_K4X8X8");
}

TEST_F(ARMV7, MATRIX_MUL_INT16x16x32) {
    matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
                                 handle(), "ARMV7_INT16X16X32_K12X4X1");
}

TEST_F(ARMV7, MATRIX_MUL_INT16x16x32_MK8) {
    matrix_mul::check_matrix_mul(dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
                                 handle(), "ARMV7_INT16X16X32_MK8_4X8",
                                 param::MatrixMul::Format::MK8, 4);
}
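
// Half-precision kernels exist only when the toolchain reports FP16 vector
// arithmetic support.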
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARMV7, MATRIX_MUL_FP16) {
    matrix_mul::check_matrix_mul(dtype::Float16{}, dtype::Float16{},
                                 dtype::Float16{}, handle(),
                                 "AARCH32_F16_K4X16X1");
}

TEST_F(ARMV7, MATRIX_MUL_F16_MK8) {
    matrix_mul::check_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, handle(),
            "AARCH32_F16_MK8_4X8", param::MatrixMul::Format::MK8, 4);
}
#endif
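
// The AARCH32 int8/quint8 dot-product kernels require the Arm dot product
// extension (sdot/udot instructions).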
#if __ARM_FEATURE_DOTPROD
TEST_F(ARMV7, MATRIX_MUL_SDOT) {
    matrix_mul::check_matrix_mul(dtype::Int8(), dtype::Int8(), dtype::Int32(),
                                 handle(), "AARCH32_INT8_K6X8X4");
}

TEST_F(ARMV7, MATRIX_MUL_UDOT) {
    matrix_mul::check_matrix_mul(
            dtype::Quantized8Asymm(4.0f, static_cast<uint8_t>(10)),
            dtype::Quantized8Asymm(3.0f, static_cast<uint8_t>(54)),
            dtype::QuantizedS32(12.0f), handle(), "AARCH32_QUINT8_K4X8X4");
}
#endif
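
// Benchmark helpers. Each one times the named kernel against a baseline and
// prints throughput: `computations` is 2*M*K*N*1e-6 Mflop, so dividing it by a
// time in milliseconds yields Gflop/s.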
#if MEGDNN_WITH_BENCHMARK
namespace {

void run_8x8x16_benchmark(const char* algo, Handle* handle) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle);
    Benchmarker<MatrixMul> benchmarker_int_kern_4x2x16(handle);
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X16"));
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    benchmarker_int_kern_4x2x16.set_before_exec_callback(
            AlgoChecker<MatrixMul>(algo));
    benchmarker_int_kern_4x2x16.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int16{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle);
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto int_kern_used =
                benchmarker_int_kern_4x2x16.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops %s: %f ms %f Gflops "
               "speedup(%s/arm_common, %s/float): %f %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, algo, int_kern_used,
               computations / int_kern_used, algo, algo,
               int_used / int_kern_used, float_used / int_kern_used);
    };
    run(256, 12 * 24, 256);
    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
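
// Same pattern for int16x16x32: compare the int16 kernel against the default
// float matmul over gemv and gemm shapes.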
void run_16x16x32_benchmark(const char* algo, Handle* handle) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle);
    benchmarker_int.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_INT16X16X32_K12X4X1"));
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int16{})
            .set_dtype(1, dtype::Int16{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle);
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
               "int: %f ms %f Gflops %s: \n"
               "speedup(%s/arm_common, %s/float): %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, algo, algo, algo,
               float_used / int_used);
    };
    run(256, 12 * 24, 256);
    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N :
                 {1, 2, 3, 4, 8, 64, 112, 113, 114, 115, 256, 257, 258, 259}) {
                run(M, N, K);
            }
        }
    }
}
#if __ARM_FEATURE_DOTPROD
void run_8x8x32_benchmark(const char* algo, Handle* handle) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int8(handle);
    benchmarker_int8.set_before_exec_callback(AlgoChecker<MatrixMul>(algo));
    benchmarker_int8.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle);
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int8.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops \n"
               "int: %f ms %f Gflops %s: \n"
               "speedup(%s/arm_common, %s/float): %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, algo, algo, algo,
               float_used / int_used);
    };
    run(256, 12 * 24, 256);
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {113, 114, 115, 256, 1024}) {
                run(M, N, K);
            }
        }
    }
}
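
// Compares the dot-product quint8 kernel (AARCH32_QUINT8_K4X8X4) against the
// plain ARMV7_QUINT8_K4X8X8 kernel on the same quantized dtypes.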
void run_8x8x32_quint_benchmark(Handle* handle) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_quint8_dot(handle);
    benchmarker_quint8_dot.set_before_exec_callback(
            AlgoChecker<MatrixMul>("AARCH32_QUINT8_K4X8X4"));
    benchmarker_quint8_dot.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
            .set_dtype(1, dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
            .set_dtype(2, dtype::QuantizedS32(2.3f * 3.1f))
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_quint8(handle);
    benchmarker_quint8.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_QUINT8_K4X8X8"));
    benchmarker_quint8.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(2.3f, static_cast<uint8_t>(20)))
            .set_dtype(1, dtype::Quantized8Asymm(3.1f, static_cast<uint8_t>(30)))
            .set_dtype(2, dtype::QuantizedS32(2.3f * 3.1f))
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto dot_used = benchmarker_quint8_dot.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto normal_used = benchmarker_quint8.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} dot: %f ms %f Gflops \n"
               "normal: %f ms %f Gflops. speedup: %f\n",
               M, K, N, dot_used, computations / dot_used, normal_used,
               computations / normal_used, normal_used / dot_used);
    };
    run(256, 12 * 24, 256);
    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {113, 114, 115, 256, 1024}) {
                run(M, N, K);
            }
        }
    }
}
#endif

}  // namespace
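
// The BENCHMARK_* tests below simply dispatch to the helpers above with a
// concrete algorithm name.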
#if __ARM_FEATURE_DOTPROD
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x32_K6x8x4) {
    run_8x8x32_benchmark("AARCH32_INT8_K6X8X4", handle());
}

TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_QUINT8x8x32_K4x8x4) {
    run_8x8x32_quint_benchmark(handle());
}
#endif

TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x2x16) {
    run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X2X16", handle());
}

TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT8x8x16_K4x8x8) {
    run_8x8x16_benchmark("ARMV7_INT8X8X16_K4X8X8", handle());
}

TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_K12x4x1) {
    run_16x16x32_benchmark("ARMV7_INT16X16X32_K12X4X1", handle());
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_FP16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_fp16(handle());
    benchmarker_fp16.set_times(RUNS)
            .set_dtype(0, dtype::Float16())
            .set_dtype(1, dtype::Float16())
            .set_dtype(2, dtype::Float16())
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto fp16_used = benchmarker_fp16.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops fp16: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, fp16_used,
               computations / fp16_used, float_used / fp16_used);
    };
    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_F16_MK8) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float16{}, dtype::Float16{},
            dtype::Float16{}, "AARCH32_F16_MK8_4X8",
            param::MatrixMul::Format::MK8, dtype::Float16{}, dtype::Float16{},
            dtype::Float16{}, "AARCH32_F16_K4X16X1");
}
#endif
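
// benchmark_with_contrast times the MK-format kernel given by the first
// dtype/algorithm group against the contrast run described by the second group
// (the default algorithm when no second name is given) on the same shapes.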
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_MK4) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{}, "ARMV7_F32_MK4_4x8",
            param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{});
}

TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_PACK_MK4) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{}, "ARMV7_F32_MK4_PACK_4X12",
            param::MatrixMul::Format::MK4, dtype::Float32{}, dtype::Float32{},
            dtype::Float32{});
}

TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT16x16x32_MK8) {
    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(4);
    matrix_mul::benchmark_with_contrast(
            handle(), args, dtype::Int16{}, dtype::Int16{}, dtype::Int32{},
            "ARMV7_INT16X16X32_MK8_4X8", param::MatrixMul::Format::MK8,
            dtype::Int16{}, dtype::Int16{}, dtype::Int32{});
}
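
// For the MK4 comparison the packed layouts are built by hand: A is
// (M/4, K/4, 4, 4) and B is (K/4, N, 4), while the default kernel runs on the
// plain (M, K) x (K, N) operands.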
TEST_F(ARMV7, BENCHMARK_MATRIX_MUL_INT32_MK_4X2X16) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    Benchmarker<MatrixMul> benchmarker_mk4(handle());
    benchmarker.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_K4X2X16"));
    param.format = MatrixMul::Param::Format::MK4;
    benchmarker_mk4.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARMV7_INT8X8X32_MK4_4X2X16"));
    benchmarker_mk4.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto mk_used = benchmarker_mk4.exec(
                               {{M / 4, K / 4, 4, 4}, {K / 4, N, 4}, {}}) /
                       RUNS;
        auto default_used = benchmarker.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} normal: %f ms %f Gflops mk4: %f ms "
               "%f Gflops speedup_vs_normal: %f\n",
               M, K, N, default_used, computations / default_used, mk_used,
               computations / mk_used, default_used / mk_used);
    };
    run(256, 256, 128);
    for (size_t k = 4; k <= 512; k *= 2) {
        for (size_t m = 4; m <= 512; m *= 2) {
            for (size_t n = 4; n <= 512; n *= 2) {
                run(m, n, k);
            }
        }
        std::cout << std::endl;
    }
}
#endif

// vim: syntax=cpp.doxygen
