

/**
 * \file dnn/test/arm_common/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/arm_common/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/rng.h"

using namespace megdnn;
using namespace test;

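// Correctness tests: matrix_mul::check_matrix_mul (declared in
// test/common/matrix_mul.h) exercises the MatrixMul operator over a standard
// set of shapes for the given A/B/C dtype combination on the ARM handle.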
TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x32) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
                                 handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x16) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
                                 handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_QUINT8) {
    matrix_mul::check_matrix_mul(dtype::Quantized8Asymm(1.2f, (uint8_t)127),
                                 dtype::Quantized8Asymm(1.3f, (uint8_t)129), {},
                                 handle());
}

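// The FP32 test pins the algorithm to ARM_COMMON_F32_GEMV through AlgoChecker,
// so the shape loops below only cover GEMV-friendly cases:
// M < 8, M == 8 with tiny K, and N == 1.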
TEST_F(ARM_COMMON, MATRIX_MUL_FP32) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));
    // M < 8
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // M = 8, K = 1, 2
    for (size_t M : {8})
        for (size_t K : {1, 2})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {1})
                run(M, K, N);
}

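// FP16 tests are compiled only when the toolchain provides
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC; the epsilon of 1e-2 tolerates
// half-precision rounding error.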
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, MATRIX_MUL_FP16) {
    Checker<MatrixMul> checker(handle());
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);

    using Param = MatrixMul::Param;
    auto args = matrix_mul::get_matmul_args_no_mask();
    for (auto& arg : args) {
        size_t m = arg.m, n = arg.n, k = arg.k;
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    }
}

TEST_F(ARM_COMMON, MATRIX_MUL_FP16_TEST) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));
    // M = 1, 2, 3, 4
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {13, 1024, 2048})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {1})
                run(M, K, N);
}
#endif

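// The benchmarks below report throughput as GFLOPS, counting 2*M*K*N floating
// point operations per matrix multiply and dividing by the measured time.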
#if MEGDNN_WITH_BENCHMARK
TEST_F(ARM_COMMON, BENCHMARK_SGEMV) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);

    auto run = [&](size_t M, size_t K, size_t N) {
        std::cout << "SGEMV: (" << M << ", " << K << ", " << N << ")"
                  << std::endl;
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2.f * M * K * N * 1e-6;
        auto perf = computations / time;
        std::cout << "gemv fp32, Performance is " << perf << " Gflops"
                  << std::endl;
    };

    std::cout << "warm up:\n";
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 4, 8})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);
}

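// Same GEMV benchmark for fp16, pinned to the ARM_COMMON_F16_GEMV algorithm
// via AlgoChecker.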
TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP16) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));

    auto run = [&](size_t M, size_t K, size_t N) {
        std::cout << "SGEMV: (" << M << ", " << K << ", " << N << ")"
                  << std::endl;
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2 * M * K * N * 1e-6;
        auto perf = computations / time;
        std::cout << "gemv fp16, Performance is " << perf << " Gflops"
                  << std::endl;
    };

    std::cout << "warm up:\n";
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);
}

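// For SGEMM the unit conversion is folded into `mod`: assuming exec() returns
// the total time in milliseconds over exec_times iterations,
// 2*M*K*N / time * mod yields GFLOPS for a single run.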
TEST_F(ARM_COMMON, BENCHMARK_SGEMM) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);

    float mod = 1000 * exec_times / 1e9;
    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        std::cout << "SGEMM: (" << M << ", " << K << ", " << N << ")"
                  << std::endl;
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        time = benchmarker.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        std::cout << "gemm fp32, Performance is " << perf << " Gflops"
                  << std::endl;
    };

    std::cout << "warm up:\n";
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

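// The remaining benchmarks compare the quantized int kernels against the
// default float kernel on the same shapes and print the resulting speedup
// (float time / int time).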
TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_INT8x8x32) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

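// Transposed variant: with transposeA and transposeB set, the operands are
// passed as {K, M} and {N, K}, so the computed problem is still M x N with
// reduction size K.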
TEST_F(ARM_COMMON, BENCHMARK_TRANSPOSED_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = param.transposeB = true;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
#endif

// vim: syntax=cpp.doxygen
