
matrix_mul.cpp

/**
 * \file dnn/test/arm_common/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/arm_common/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/rng.h"

#if MGB_ENABLE_CPUINFO
#include "cpuinfo.h"
#endif

using namespace megdnn;
using namespace test;
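
// The tests below cover the ARM-common matrix-mul kernels (GEMM, GEMV and
// GEVM) across the supported dtype combinations. Where a test targets one
// specific kernel, an AlgoChecker before-exec callback pins that algorithm,
// so the test fails if it is not actually selected.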
TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x32) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
                                 handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x16) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
                                 handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_QUINT8) {
    matrix_mul::check_matrix_mul(dtype::Quantized8Asymm(1.2f, (uint8_t)127),
                                 dtype::Quantized8Asymm(1.3f, (uint8_t)129), {},
                                 handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_FP32) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));
    // M < 8
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // M = 8, K = 1, 2
    for (size_t M : {8})
        for (size_t K : {1, 2})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {1})
                run(M, K, N);
}
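
// The FP16 tests are compiled only when the toolchain defines
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC, i.e. native half-precision vector
// arithmetic is available. The checker epsilon is relaxed to 1e-2 because
// fp16 accumulation is far less precise than fp32.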
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, MATRIX_MUL_FP16) {
    Checker<MatrixMul> checker(handle());
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);

    using Param = MatrixMul::Param;
    auto args = matrix_mul::get_matmul_args_no_mask();

    for (auto& arg : args) {
        size_t m = arg.m, n = arg.n, k = arg.k;
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    }
}

TEST_F(ARM_COMMON, MATRIX_MUL_FP16_TEST) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));
    // M = 1, 2, 3, 4
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {13, 1024, 2048})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {1})
                run(M, K, N);
}
#endif
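
// In the quantized tests below the output scale is the product of the input
// scales: 2.5f * 2.5f = 6.25f, hence QuantizedS32(6.25f) for the result.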
TEST_F(ARM_COMMON, QINT8x8x32_GEMV) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {1, 10, 16, 33, 64})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {1})
                run(M, K, N);
}
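
// MK4 is the 4-wide packed format used by these kernels: A is laid out in
// 4x4 tiles as {M/4, K/4, 4, 4}, and for the N = 1 GEMV case below B is a
// packed vector of shape {K/4, 1, 4}. M and K must therefore be multiples
// of 4.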
TEST_F(ARM_COMMON, QINT8x8x32_GEMV_MK4) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV_MK4"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());

    auto run = [&](size_t M, size_t K, size_t N) {
        MEGDNN_MARK_USED_VAR(N);
        Param param;
        param.format = param::MatrixMul::Format::MK4;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 16, 20, 24, 256, 1024})
            run(M, K, 1);
}
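
// MK4_DOT pairs the packed layout with kernels built on the Arm dot-product
// (sdot/udot) instructions, which is why the test is compiled only when
// MGB_ENABLE_DOT is set.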
#if MGB_ENABLE_DOT
TEST_F(ARM_COMMON, QINT8x8x32_GEMV_MK4_DOT) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV_MK4_DOT"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.format = param::MatrixMul::Format::MK4_DOT;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 16, 20, 24, 256, 1024})
            run(M, K, 1);
}
#endif
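
// GEVM is the vector-times-matrix case: M is fixed to 1, and with
// transposeB = true the second operand is supplied as {N, K}, so each output
// element is the dot product of the length-K input vector with a row of B.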
TEST_F(ARM_COMMON, QINT8x8x32_GEVM) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_GEVM"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = true;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{N, K};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // M = 1
    for (size_t N : {1, 10, 16, 33, 64})
        for (size_t K : {7, 512, 1024})
            for (size_t M : {1})
                run(M, K, N);
}

TEST_F(ARM_COMMON, FP32_GEVM) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_GEVM"));
    checker.set_epsilon(1e-2);

    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = true;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{N, K};
        checker.set_param(param).execs({A, B, {}});
    };

    // M = 1
    for (size_t M : {1})
        for (size_t K : {1000, 4096})
            for (size_t N : {1000, 4096})
                run(M, K, N);
}

TEST_F(ARM_COMMON, FP32_GEMV_MK4) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV_MK4"));
    checker.set_epsilon(1e-2);

    auto run = [&](size_t M, size_t K) {
        Param param;
        param.format = param::MatrixMul::Format::MK4;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param).execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 128, 256, 4096})
            run(M, K);
}

#if MEGDNN_WITH_BENCHMARK
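
// In the benchmarks below, Benchmarker::exec returns the total wall time in
// milliseconds over set_times() runs. A matrix mul of shape (M, K, N) does
// 2 * M * K * N floating-point operations (one multiply and one add per
// accumulation), so computations = 2 * M * K * N * 1e-6 (Mflop) divided by
// the per-run time in ms yields Gflop/s.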

TEST_F(ARM_COMMON, BENCHMARK_SGEMV) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);

    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2.f * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 4, 8})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);
    for (size_t M : {4, 64, 1024, 4096})
        for (size_t K : {128, 256, 1024, 4096})
            run(M, K, 1);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP32) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));

    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2 * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    run(12, 48, 1);
    run(48, 12, 1);
    run(32, 128, 1);
    run(128, 32, 1);
    run(64, 256, 1);
    run(256, 64, 1);
    run(128, 512, 1);
    run(512, 128, 1);
    run(256, 1024, 1);
    run(1024, 256, 1);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_MK4) {
    int exec_times = 10;
    using Param = MatrixMul::Param;
    Param param;
    param.format = param::MatrixMul::Format::MK4;
    param.transposeA = false;
    param.transposeB = false;

    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_param(param);

    auto run = [&](size_t M, size_t K) {
        printf("SGEMV_MK4: (%zu, %zu, 1)\n", M, K);
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        auto time = benchmarker.exec({A, B, {}}) / exec_times;
        auto computations = 2.f * M * K * 1e-6;
        auto perf = computations / time;
        printf("gemv mk4 fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_display(false)
                .exec({{4, 256, 4, 4}, {256, 1, 4}, {}});
    }

    // run gemv mk4
    for (size_t M : {4, 64, 1024, 4096})
        for (size_t K : {128, 1024, 4096})
            run(M, K);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP16) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));

    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV_FP16: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2 * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp16, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMM) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
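    // Here exec() reports total ms over exec_times runs, so multiplying
    // flops / ms by mod = 1000 * exec_times / 1e9 converts the result to
    // Gflop/s.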
    float mod = 1000 * exec_times / 1e9;

    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        printf("SGEMM: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        time = benchmarker.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        printf("gemm, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_INT8x8x32) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_TRANSPOSED_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = param.transposeB = true;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
#endif

// vim: syntax=cpp.doxygen

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine has GPU hardware and the driver installed. If you would like to try deep-learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.