
matrix_mul.cpp 20 kB

/**
 * \file dnn/test/arm_common/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "test/arm_common/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/rng.h"

#if MGB_ENABLE_CPUINFO
#include "cpuinfo.h"
#endif

using namespace megdnn;
using namespace test;

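// The three checks below go through the shared matrix_mul::check_matrix_mul()
// helper, which sweeps a standard set of (M, K, N) shapes and verifies the
// optimized ARM kernels against the naive reference implementation.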
TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x32) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int32{}, handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_INT8x8x16) {
    matrix_mul::check_matrix_mul(
            dtype::Int8{}, dtype::Int8{}, dtype::Int16{}, handle());
}

TEST_F(ARM_COMMON, MATRIX_MUL_QUINT8) {
    matrix_mul::check_matrix_mul(
            dtype::Quantized8Asymm(1.2f, (uint8_t)127),
            dtype::Quantized8Asymm(1.3f, (uint8_t)129), {}, handle());
}

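// MATRIX_MUL_FP32 pins the ARM_COMMON_F32_GEMV algorithm via AlgoChecker and
// probes the shape boundaries that matter for the GEMV path: M < 8, M == 8
// with tiny K, and the pure N == 1 column case.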
TEST_F(ARM_COMMON, MATRIX_MUL_FP32) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));
    // M < 8
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // M = 8, K = 1, 2
    for (size_t M : {8})
        for (size_t K : {1, 2})
            for (size_t N : {7, 1024, 2056})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4, 5, 6, 7})
        for (size_t K : {7, 1024, 2048})
            for (size_t N : {1})
                run(M, K, N);
}

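// The fp16 tests are only built when the toolchain exposes native
// half-precision vector arithmetic; epsilon is relaxed to 1e-2 since fp16
// accumulation is much less precise than fp32.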
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_F(ARM_COMMON, MATRIX_MUL_FP16) {
    Checker<MatrixMul> checker(handle());
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);
    using Param = MatrixMul::Param;
    auto args = matrix_mul::get_matmul_args_no_mask();
    for (auto& arg : args) {
        size_t m = arg.m, n = arg.n, k = arg.k;
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    }
}

TEST_F(ARM_COMMON, MATRIX_MUL_FP16_TEST) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_epsilon(1e-2);
    NormalRNG rng(2.f);
    checker.set_rng(0, &rng).set_rng(1, &rng);
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .execs({A, B, {}});
    };

    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));
    // M = 1, 2, 3, 4
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {13, 1024, 2048})
                run(M, K, N);
    // N = 1
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {1})
                run(M, K, N);
}
#endif

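// Quantized int8 GEMV: both inputs use scale 2.5, so the int32 output dtype
// carries the product scale 2.5 * 2.5 = 6.25.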
TEST_F(ARM_COMMON, QINT8x8x32_GEMV) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV"));
    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{K, N};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {1, 10, 16, 33, 64})
        for (size_t K : {7, 512, 1024})
            for (size_t N : {1})
                run(M, K, N);
}

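// MK4 is a 4x4-blocked layout: A is packed as {M/4, K/4, 4, 4} and the GEMV
// right-hand side as {K/4, 1, 4}.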
TEST_F(ARM_COMMON, QINT8x8x32_GEMV_MK4) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV_MK4"));
    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());
    auto run = [&](size_t M, size_t K, size_t N) {
        MEGDNN_MARK_USED_VAR(N);
        Param param;
        param.format = param::MatrixMul::Format::MK4;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 16, 20, 24, 256, 1024})
            run(M, K, 1);
}

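// The MK4_DOT variant targets the int8 dot-product instructions and is
// therefore compiled only when MGB_ENABLE_DOT is set.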
#if MGB_ENABLE_DOT
TEST_F(ARM_COMMON, QINT8x8x32_GEMV_MK4_DOT) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMul>("ARM_COMMON_INT8X8X32_GEMV_MK4_DOT"));
    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.format = param::MatrixMul::Format::MK4_DOT;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 16, 20, 24, 256, 1024})
            run(M, K, 1);
}
#endif

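// GEVM: a single row (M = 1) times a matrix. transposeB is set, so B is
// passed with shape {N, K}.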
TEST_F(ARM_COMMON, QINT8x8x32_GEVM) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_GEVM"));
    std::unique_ptr<RNG> rng = std::make_unique<UniformIntRNG>(-127, 127);
    checker.set_rng(0, rng.get()).set_rng(1, rng.get());
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = true;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{N, K};
        checker.set_param(param)
                .set_dtype(0, dtype::QuantizedS8(2.5f))
                .set_dtype(1, dtype::QuantizedS8(2.5f))
                .set_dtype(2, dtype::QuantizedS32(6.25f))
                .execs({A, B, {}});
    };

    // M = 1
    for (size_t N : {1, 10, 16, 33, 64})
        for (size_t K : {7, 512, 1024})
            for (size_t M : {1})
                run(M, K, N);
}

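// The same GEVM pattern in fp32; epsilon is loosened to 1e-2, presumably to
// absorb rounding drift over reductions with K up to 4096.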
TEST_F(ARM_COMMON, FP32_GEVM) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_GEVM"));
    checker.set_epsilon(1e-2);
    auto run = [&](size_t M, size_t K, size_t N) {
        Param param;
        param.transposeA = false;
        param.transposeB = true;
        TensorShape A, B;
        A = TensorShape{M, K};
        B = TensorShape{N, K};
        checker.set_param(param).execs({A, B, {}});
    };

    // M = 1
    for (size_t M : {1})
        for (size_t K : {1000, 4096})
            for (size_t N : {1000, 4096})
                run(M, K, N);
}

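// fp32 counterpart of the MK4 GEMV check above, reusing the blocked
// {M/4, K/4, 4, 4} x {K/4, 1, 4} shapes.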
TEST_F(ARM_COMMON, FP32_GEMV_MK4) {
    Checker<MatrixMul> checker(handle());
    using Param = MatrixMul::Param;
    checker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV_MK4"));
    checker.set_epsilon(1e-2);
    auto run = [&](size_t M, size_t K) {
        Param param;
        param.format = param::MatrixMul::Format::MK4;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        checker.set_param(param).execs({A, B, {}});
    };

    // N = 1
    for (size_t M : {4, 16, 128, 1024})
        for (size_t K : {4, 8, 12, 128, 256, 4096})
            run(M, K);
}

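// Everything below is benchmark-only code, compiled under MEGDNN_WITH_BENCHMARK.
// All of the benchmarks count 2 * M * K * N operations (one multiply and one
// add per inner-product term) and report throughput in Gflops.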
#if MEGDNN_WITH_BENCHMARK
TEST_F(ARM_COMMON, BENCHMARK_SGEMV) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2.f * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 4, 8})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);
    for (size_t M : {4, 64, 1024, 4096})
        for (size_t K : {128, 256, 1024, 4096})
            run(M, K, 1);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP32) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));
    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2 * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    run(12, 48, 1);
    run(48, 12, 1);
    run(32, 128, 1);
    run(128, 32, 1);
    run(64, 256, 1);
    run(256, 64, 1);
    run(128, 512, 1);
    run(512, 128, 1);
    run(256, 1024, 1);
    run(1024, 256, 1);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_MK4) {
    int exec_times = 10;
    using Param = MatrixMul::Param;
    Param param;
    param.format = param::MatrixMul::Format::MK4;
    param.transposeA = false;
    param.transposeB = false;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_param(param);
    auto run = [&](size_t M, size_t K) {
        printf("SGEMV_MK4: (%zu, %zu, 1)\n", M, K);
        TensorShape A, B;
        A = TensorShape{M / 4, K / 4, 4, 4};
        B = TensorShape{K / 4, 1, 4};
        auto time = benchmarker.exec({A, B, {}}) / exec_times;
        auto computations = 2.f * M * K * 1e-6;
        auto perf = computations / time;
        printf("gemv mk4 fp32, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_dtype(2, dtype::Float32())
                .set_display(false)
                .exec({{4, 256, 4, 4}, {256, 1, 4}, {}});
    }

    // run gemv mk4
    for (size_t M : {4, 64, 1024, 4096})
        for (size_t K : {128, 1024, 4096})
            run(M, K);
}

TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP16) {
    int exec_times = 50;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    benchmarker.set_before_exec_callback(AlgoChecker<MatrixMul>("ARM_COMMON_F16_GEMV"));
    auto run = [&](size_t M, size_t K, size_t N) {
        printf("SGEMV_FP16: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16());
        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
        auto computations = 2 * M * K * N * 1e-6;
        auto perf = computations / time;
        printf("gemv fp16, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float16())
                .set_dtype(1, dtype::Float16())
                .set_dtype(2, dtype::Float16())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    // run gemv
    for (size_t M : {1, 2, 3, 4})
        for (size_t K : {1024, 1536, 2048})
            for (size_t N : {512, 1024})
                run(M, K, N);
}

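// Unlike the GEMV benchmarks above, time here is the total over exec_times
// runs, so mod (= 1000 * exec_times / 1e9) folds the run count and the
// ms -> Gflops conversion into a single factor.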
TEST_F(ARM_COMMON, BENCHMARK_SGEMM) {
    int exec_times = 10;
    Benchmarker<MatrixMul> benchmarker(handle());
    benchmarker.set_times(exec_times);
    float mod = 1000 * exec_times / 1e9;
    auto run = [&](size_t M, size_t K, size_t N) {
        float time = 1.f, perf = 1.f;
        printf("SGEMM: (%zu, %zu, %zu)\n", M, K, N);
        benchmarker.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
        time = benchmarker.exec({{M, K}, {K, N}, {}});
        perf = 2.f * M * K * N / time * mod;
        printf("gemm, Performance is %f Gflops\n", perf);
    };

    printf("warm up:\n");
    for (int i = 0; i < 50; i++) {
        benchmarker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_display(false)
                .exec({{2, 1024}, {1024, 512}, {}});
        benchmarker.set_display(true);
    }

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

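// The remaining benchmarks feed the same shapes to an int8 benchmarker and a
// float benchmarker and print the int-vs-float speedup for each case.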
TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_INT8x8x32) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Int8{})
            .set_dtype(1, dtype::Int8{})
            .set_dtype(2, dtype::Int32{})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);

    //////////////////////// gemv //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            run(M, 1, K);
        }
    }

    //////////////////////// gemm //////////////////////////
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 16, 32, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

TEST_F(ARM_COMMON, BENCHMARK_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{M, K}, {K, N}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{M, K}, {K, N}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}

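// Transposed variant of the previous benchmark: with transposeA = transposeB
// = true, the operands are handed over as A{K, M} and B{N, K}.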
TEST_F(ARM_COMMON, BENCHMARK_TRANSPOSED_MATRIX_MUL_QUINT8) {
    constexpr size_t RUNS = 50;
    param::MatrixMul param;
    param.transposeA = param.transposeB = true;
    Benchmarker<MatrixMul> benchmarker_int(handle());
    benchmarker_int.set_times(RUNS)
            .set_dtype(0, dtype::Quantized8Asymm(1.2f, (uint8_t)127))
            .set_dtype(1, dtype::Quantized8Asymm(1.3f, (uint8_t)129))
            .set_dtype(2, {})
            .set_param(param)
            .set_display(false);
    Benchmarker<MatrixMul> benchmarker_float(handle());
    benchmarker_float.set_param(param).set_display(false).set_times(RUNS);
    auto run = [&](size_t M, size_t N, size_t K) {
        auto int_used = benchmarker_int.exec({{K, M}, {N, K}, {}}) / RUNS;
        auto float_used = benchmarker_float.exec({{K, M}, {N, K}, {}}) / RUNS;
        float computations = 2.f * M * K * N * 1e-6;
        printf("run: {%zu{M} %zu{K} %zu{N}} float: %f ms %f Gflops int: %f ms "
               "%f Gflops speedup: %f\n",
               M, K, N, float_used, computations / float_used, int_used,
               computations / int_used, float_used / int_used);
    };

    run(256, 12 * 24, 256);
    for (size_t M : {8, 64, 112, 256}) {
        for (size_t K : {8, 64, 112, 256}) {
            for (size_t N : {8, 64, 112, 256}) {
                run(M, N, K);
            }
        }
    }
}
#endif

// vim: syntax=cpp.doxygen
