
matrix_mul.cpp

/**
 * \file dnn/test/arm_common/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/arm_common/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/rng.h"

using namespace megdnn;
using namespace test;
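// This file exercises the ARM_COMMON matrix-mul implementations: generic
// correctness checks, the GEMV/GEVM fast paths (pinned via AlgoChecker), and,
// under MEGDNN_WITH_BENCHMARK, throughput benchmarks.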
TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x32) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int32{},
                                 handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x16) {
    matrix_mul::check_matrix_mul(dtype::Int8{}, dtype::Int8{}, dtype::Int16{},
                                 handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_QUINT8) {
    matrix_mul::check_matrix_mul(dtype::Quantized8Asymm(1.2f, (uint8_t)127),
                                 dtype::Quantized8Asymm(1.3f, (uint8_t)129), {},
                                 handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_FP32) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));
    // M < 8
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // M = 8, K = 1, 2
    for (size_t M : {8})
        for (size_t K : {1, 2})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {1})
                run(M, K, N);
}
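// The FP16 tests are only compiled when the target supports hardware fp16
// vector arithmetic (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC).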
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, MATRIX_MUL_FP16) {
    Checker<MatrixMul> checker(handle());
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);
    using Param = MatrixMul::Param;
    auto args = matrix_mul::get_matmul_args_no_mask();
    for (auto& arg : args) {
        size_t m = arg.m, n = arg.n, k = arg.k;
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    }
}

TEST_F(ARM_COMMON, MATRIX_MUL_FP16_TEST) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));
    // M = 1, 2, 3, 4
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {13, 1024, 2048})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {1})
                run(M, K, N);
}
#endif

TEST_F(ARM_COMMON, QINT8x8x32_GEMV) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {1, 10, 16, 33, 64})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {1})
                run(M, K, N);
}
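// MK4 is a blocked layout: A is packed as {M/4, K/4, 4, 4} 4x4 tiles and the
// vector operand B as {K/4, 1, 4}, so M and K must be multiples of 4.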
TEST_F(ARM_COMMON, QINT8x8x32_GEMV_MK4) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV_MK4"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());
    auto run = [&](size_t M, size_t K, size_t N) {
        MEGDNN_MARK_USED_VAR(N);
        Param param;
        param.format = param::MatrixMul::Format::MK4;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 16, 20, 24, 256, 1024})
            run(M, K, 1);
}
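// The MK4_DOT variant relies on the ARMv8.2 dot-product instructions, so it
// is guarded by __ARM_FEATURE_DOTPROD.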
#if __ARM_FEATURE_DOTPROD
TEST_F(ARM_COMMON, QINT8x8x32_GEMV_MK4_DOT) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV_MK4_DOT"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.format = param::MatrixMul::Format::MK4_DOT;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 16, 20, 24, 256, 1024})
            run(M, K, 1);
}
#endif
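// GEVM: the vector operand is on the left (M == 1). B is passed with
// transposeB = true, so its shape is {N, K}.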
TEST_F(ARM_COMMON, QINT8x8x32_GEVM) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_GEVM"));

    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = true;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{N, K};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // M = 1
    for (size_t N : {1, 10, 16, 33, 64})
        for (size_t K : {7, 512, 1024})
            for (size_t M : {1})
                run(M, K, N);
}

TEST_F(ARM_COMMON, FP32_GEVM) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_GEVM"));
    checker.set_epsilon(1e-2);
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = true;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{N, K};
        checker.set_param(param).execs({A, B, {}});
    };

    // M = 1
    for (size_t M : {1})
        for (size_t K : {1000, 4096, 25088})
            for (size_t N : {1000, 4096})
                run(M, K, N);
}

TEST_F(ARM_COMMON, FP32_GEMV_MK4) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV_MK4"));
    checker.set_epsilon(1e-2);
    auto run = [&](size_t M, size_t K) {
        Param param;
        param.format = param::MatrixMul::Format::MK4;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param).execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 128, 256, 4096})
            run(M, K);
}
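// Benchmarks are compiled only with MEGDNN_WITH_BENCHMARK. Each one counts
// 2 * M * K * N multiply-accumulate ops and divides by the measured time in
// milliseconds to report Gflops.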
#if MEGDNN_WITH_BENCHMARK
TEST_F(ARM_COMMON, BENCHMARK_SGEMV) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);

    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2.f * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 4, 8})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);

    for (size_t M : {4, 64, 1024, 4096})
        for (size_t K : {128, 256, 1024, 4096})
            run(M, K, 1);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP32) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));

    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2 * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    run(12, 48, 1);
    run(48, 12, 1);
    run(32, 128, 1);
    run(128, 32, 1);
    run(64, 256, 1);
    run(256, 64, 1);
    run(128, 512, 1);
    run(512, 128, 1);
    run(256, 1024, 1);
    run(1024, 256, 1);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_MK4) {
    int exec_times = 10;
    using Param = MatrixMul::Param;
    Param param;
    param.format = param::MatrixMul::Format::MK4;
    param.transposeA = false;
    param.transposeB = false;

    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_param(param);

    auto run = [&](size_t M, size_t K) {
        printf("SGEMV_MK4: (%zu, %zu)\n", M, K);
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        auto time = benchmarker.exec({A, B, {}}) / exec_times;
        auto computations = 2.f * M * K * 1e-6;
        auto perf = computations / time;
        printf("gemv mk4 fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_display(false)
                .exec({{4, 256, 4, 4}, {256, 1, 4}, {}});
    }

    // run gemv mk4
    for (size_t M : {4, 64, 1024, 4096})
        for (size_t K : {128, 1024, 4096})
            run(M, K);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP16) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));

    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV_FP16: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2 * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp16, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMM) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    float mod = 1000 * exec_times / 1e9;

    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        printf("SGEMM: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32());
        time = benchmarker.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        printf("gemm, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_INT8x8x32) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_TRANSPOSED_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = param.transposeB = true;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);

    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
#endif

// vim: syntax=cpp.doxygen
