You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 21 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. /**
  2. * \file dnn/test/cuda/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "test/cuda/fixture.h"
  13. #include "test/common/benchmarker.h"
  14. #include "test/common/checker.h"
  15. #include "test/common/matrix_mul.h"
  16. #include "src/cuda/utils.h"
  17. #if defined(cuda_check)
  18. #undef cuda_check
  19. #endif
  20. #include "test/cuda/utils.h"
  21. #include <cuda.h>
  22. namespace megdnn {
  23. namespace test {
  24. #if CUDA_VERSION >= 10000
  25. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
  26. if (cuda::current_device_prop().major > 7 ||
  27. (cuda::current_device_prop().major == 7 &&
  28. cuda::current_device_prop().minor >= 5)) {
  29. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION test as current "
  30. "device support wmma intrinsics\n");
  31. return;
  32. }
  33. Checker<MatrixMul> checker(handle_cuda(), false);
  34. using Param = MatrixMul::Param;
  35. Param param;
  36. param.transposeB = true;
  37. checker.set_param(param);
  38. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  39. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  40. checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
  41. ASSERT_THROW(checker.exec({{256, 256}, {256, 256}, {256, 256}}),
  42. MegDNNError);
  43. }
  44. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
  45. if (cuda::current_device_prop().major < 7 ||
  46. (cuda::current_device_prop().major == 7 &&
  47. cuda::current_device_prop().minor < 5)) {
  48. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
  49. "doesn't support\n");
  50. return;
  51. }
  52. Checker<MatrixMul> checker(handle_cuda(), false);
  53. using Param = MatrixMul::Param;
  54. Param param;
  55. param.transposeB = true;
  56. checker.set_param(param);
  57. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  58. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  59. checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
  60. checker.exec({{256, 256}, {256, 256}, {256, 256}});
  61. auto args = matrix_mul::get_matmul_args();
  62. for (auto arg : args) {
  63. size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8,
  64. k = DIVUP(arg.k, 32) * 32;
  65. checker.exec({{m, k}, {n, k}, {m, n}});
  66. }
  67. }
  68. #if MEGDNN_WITH_BENCHMARK
  69. TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  70. if (cuda::current_device_prop().major < 7 ||
  71. (cuda::current_device_prop().major == 7 &&
  72. cuda::current_device_prop().minor < 5)) {
  73. printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
  74. "device doesn't support\n");
  75. return;
  76. }
  77. Benchmarker<MatrixMul> bencher(handle_cuda());
  78. using Param = MatrixMul::Param;
  79. Param param;
  80. param.transposeB = true;
  81. bencher.set_param(param);
  82. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  83. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  84. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  85. for (size_t m : {256, 1024, 4096, 10240, 40960}) {
  86. for (size_t n : {256, 1024, 4096}) {
  87. for (size_t k : {512, 1024, 2048}) {
  88. bencher.set_times(400);
  89. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  90. auto gflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  91. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m,
  92. k, n, time_in_ms, gflps);
  93. }
  94. }
  95. }
  96. }
  97. TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  98. if (cuda::current_device_prop().major < 7 ||
  99. (cuda::current_device_prop().major == 7 &&
  100. cuda::current_device_prop().minor < 5)) {
  101. printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
  102. "current "
  103. "device doesn't support\n");
  104. return;
  105. }
  106. Benchmarker<MatrixMul> bencher(handle_cuda());
  107. using Param = MatrixMul::Param;
  108. Param param;
  109. param.transposeB = true;
  110. bencher.set_param(param);
  111. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  112. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  113. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  114. bencher.set_times(400);
  115. size_t m = 4096, n = 4096, k = 81920;
  116. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  117. auto tflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  118. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n,
  119. time_in_ms, tflps);
  120. }
  121. #endif
  122. #endif
  123. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES) {
  124. if (!cuda::is_compute_capability_required(6, 1)) {
  125. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  126. return;
  127. }
  128. Checker<MatrixMul> checker(handle_cuda());
  129. using Param = MatrixMul::Param;
  130. Param param;
  131. DType stype = dtype::Int8();
  132. checker.set_param(param)
  133. .set_dtype(0, stype)
  134. .set_dtype(1, stype)
  135. .set_dtype(2, dtype::Int32())
  136. .set_epsilon(5e-3);
  137. size_t m = 1024, n = 1024, k = 1024;
  138. {
  139. TensorLayout A{{m, k}, {2048, 1}, dtype::Int8()},
  140. B{{k, n}, {2048, 1}, dtype::Int8()}, C{{m, n}, dtype::Int32()};
  141. checker.execl({A, B, {}});
  142. }
  143. }
  144. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
  145. if (!cuda::is_compute_capability_required(6, 1)) {
  146. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  147. return;
  148. }
  149. using Param = MatrixMul::Param;
  150. UniformIntRNG rng{-128, 127};
  151. Checker<MatrixMul> checker(handle_cuda());
  152. checker.set_rng(0, &rng).set_rng(1, &rng);
  153. size_t m = 1007, n = 1003, k = 129;
  154. for (unsigned mask = 0; mask < 4; ++mask) {
  155. Param param;
  156. param.transposeA = mask & 1;
  157. param.transposeB = mask & 2;
  158. TensorShape A, B;
  159. if (param.transposeA)
  160. A = TensorShape{k, m};
  161. else
  162. A = TensorShape{m, k};
  163. if (param.transposeB)
  164. B = TensorShape{n, k};
  165. else
  166. B = TensorShape{k, n};
  167. checker.set_param(param)
  168. .set_dtype(0, dtype::Int8())
  169. .set_dtype(1, dtype::Int8())
  170. .set_dtype(2, dtype::Int32())
  171. .set_epsilon(0)
  172. .execs({A, B, {}});
  173. }
  174. }
  175. TEST_F(CUDA, MATRIX_MUL_FLOAT_NAIVE) {
  176. Checker<MatrixMul> checker(handle_cuda());
  177. checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("NAIVE"));
  178. using Param = MatrixMul::Param;
  179. size_t m = 12, n = 16, k = 20;
  180. std::vector<DType> dtype_array;
  181. dtype_array.push_back(dtype::Float32());
  182. dtype_array.push_back(dtype::Float16());
  183. for (DType dtype : dtype_array) {
  184. for (unsigned mask = 0; mask < 4; ++mask) {
  185. Param param;
  186. param.transposeA = mask & 1;
  187. param.transposeB = mask & 2;
  188. DType stype = dtype;
  189. TensorShape A, B;
  190. if (param.transposeA)
  191. A = TensorShape{k, m};
  192. else
  193. A = TensorShape{m, k};
  194. if (param.transposeB)
  195. B = TensorShape{n, k};
  196. else
  197. B = TensorShape{k, n};
  198. if (dtype == dtype::Float16()) {
  199. param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
  200. }
  201. checker.set_param(param)
  202. .set_dtype(0, stype)
  203. .set_dtype(1, stype)
  204. .set_dtype(2, dtype)
  205. .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
  206. .execs({A, B, {}});
  207. }
  208. }
  209. }
TEST_F(CUDA, MATRIX_MUL) {
    // Broad matmul coverage: for each dtype in {f32, f16, bf16, and i32 when
    // the device reports compute capability >= 6.1} run all four transpose
    // combinations on a small fixed shape, then replay the shared arg list
    // with explicit (possibly strided) fp32 layouts.
    if (cuda::current_device_prop().major < 6) {
        printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    size_t m = 12, n = 16, k = 20;
    // int8x8x32 is only exercised on devices reporting >= 6.1
    bool is_int_available = cuda::is_compute_capability_required(6, 1);
    std::vector<DType> dtype_array;
    dtype_array.push_back(dtype::Float32());
    dtype_array.push_back(dtype::Float16());
    dtype_array.push_back(dtype::BFloat16());
    if (is_int_available)
        dtype_array.push_back(dtype::Int32());
    for (DType dtype : dtype_array) {
        // mask bit 0 -> transposeA, bit 1 -> transposeB
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            // the int32-output case takes int8 inputs
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            if (dtype == dtype::BFloat16()) {
                // bf16 computes in fp32 and is restricted to the dedicated
                // MATMUL_BFLOAT16 algorithm backed by CUBLAS
                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
                checker.set_before_exec_callback(
                        AlgoChecker<MatrixMulForward>(ExecutionPolicyAlgoName{
                                "MATMUL_BFLOAT16", {{"CUBLAS", {}}}}));
            }
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    // reduced-precision dtypes get a looser tolerance
                    .set_epsilon(dtype == dtype::Float16() ||
                                                 dtype == dtype::BFloat16()
                                         ? 5e-2
                                         : 5e-3)
                    .execs({A, B, {}});
            if (dtype == dtype::BFloat16()) {
                // undo the bf16-specific algo restriction so later dtypes and
                // the general tests below choose algorithms freely
                checker.reset_before_exec_callback();
                checker.opr()->execution_policy() = {};
            }
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        // UNSET_STRIDE_VAL -> default contiguous layout; otherwise build the
        // layout with the requested row stride
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                              dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                              dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                              dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
TEST_F(CUDA, MATRIX_MUL_CUBLASLT) {
    // Pins the CUBLAS_LT algorithm: one int8x8x32 case, fp32/fp16 across all
    // transpose combinations, then the shared arg list with explicit
    // (possibly strided) fp32 layouts. Requires compute capability >= 7.5.
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(
                    AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    size_t m = 32, n = 32, k = 32;
    // test Int8 matmul
    {
        DType dtype = dtype::Int32();
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        // int32 output implies int8 inputs
        DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, stype)
                .set_dtype(1, stype)
                .set_dtype(2, dtype)
                .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                .execs({A, B, {}});
    }
    // test float-point matmul
    for (DType dtype :
         std::array<DType, 2>{{dtype::Float32(), dtype::Float16()}}) {
        // mask bit 0 -> transposeA, bit 1 -> transposeB
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    // fp16 gets a looser tolerance than fp32
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 8e-3)
                    .execs({A, B, {}});
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        // UNSET_STRIDE_VAL -> default contiguous layout; otherwise apply the
        // requested row stride
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                              dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                              dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                              dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
  393. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_SPECIAL_CASE) {
  394. require_compute_capability(7, 5);
  395. size_t m = 12, n = 16, k = 20;
  396. Checker<MatrixMul> checker(handle_cuda());
  397. checker.set_before_exec_callback(
  398. AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  399. using Param = MatrixMul::Param;
  400. Param param;
  401. DType stype = dtype::Float32();
  402. DType dtype = dtype::Float32();
  403. TensorShape A, B;
  404. param.transposeA = param.transposeB = 1;
  405. if (param.transposeA)
  406. A = TensorShape{k, m};
  407. else
  408. A = TensorShape{m, k};
  409. if (param.transposeB)
  410. B = TensorShape{n, k};
  411. else
  412. B = TensorShape{k, n};
  413. checker.set_param(param)
  414. .set_dtype(0, stype)
  415. .set_dtype(1, stype)
  416. .set_dtype(2, dtype)
  417. .set_epsilon(dtype == dtype::Float16() ? 5e-1 : 5e-2)
  418. .execs({A, B, {}});
  419. }
  420. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
  421. require_compute_capability(7, 5);
  422. NormalRNG normal_rng;
  423. Checker<MatrixMul> checker(handle_cuda());
  424. checker.set_rng(0, &normal_rng)
  425. .set_rng(1, &normal_rng)
  426. .set_before_exec_callback(
  427. AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  428. using Param = MatrixMul::Param;
  429. // size_t m = 32, n = 32, k = 32;
  430. // test Int8 matmul
  431. for (size_t m = 8; m <= 64; m += 4)
  432. for (size_t n = 8; n <= 64; n += 4)
  433. for (size_t k = 8; k <= 64; k += 4) {
  434. DType dtype = dtype::Int32();
  435. Param param;
  436. param.transposeA = false;
  437. param.transposeB = false;
  438. DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
  439. TensorShape A, B;
  440. A = TensorShape{m, k};
  441. B = TensorShape{k, n};
  442. checker.set_param(param)
  443. .set_dtype(0, stype)
  444. .set_dtype(1, stype)
  445. .set_dtype(2, dtype)
  446. .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
  447. .execs({A, B, {}});
  448. }
  449. }
  450. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
  451. require_compute_capability(7, 5);
  452. size_t m = 128, n = 1024, k = 18432;
  453. Checker<MatrixMul> checker(handle_cuda());
  454. checker.set_before_exec_callback(
  455. AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  456. using Param = MatrixMul::Param;
  457. Param param;
  458. DType stype = dtype::Float32();
  459. DType dtype = dtype::Float32();
  460. TensorShape A, B;
  461. param.transposeA = param.transposeB = 0;
  462. if (param.transposeA)
  463. A = TensorShape{k, m};
  464. else
  465. A = TensorShape{m, k};
  466. if (param.transposeB)
  467. B = TensorShape{n, k};
  468. else
  469. B = TensorShape{k, n};
  470. checker.set_param(param)
  471. .set_dtype(0, stype)
  472. .set_dtype(1, stype)
  473. .set_dtype(2, dtype)
  474. .execs({A, B, {}});
  475. }
  476. TEST_F(CUDA, MATRIX_MUL_CUDNN_F32_uncont) {
  477. Checker<MatrixMul> checker(handle_cuda());
  478. checker.set_before_exec_callback(
  479. AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
  480. using Param = MatrixMul::Param;
  481. size_t m = 100, n = 100, k = 100;
  482. Param param;
  483. param.transposeA = 1;
  484. param.transposeB = 1;
  485. TensorLayout A{{m, k}, {128, 1}, dtype::Float32()},
  486. B{{k, n}, {128, 1}, dtype::Float32()}, C{{m, n}, dtype::Float32()};
  487. DType stype = dtype::Float32();
  488. DType dtype = dtype::Float32();
  489. checker.set_param(param)
  490. .set_dtype(0, stype)
  491. .set_dtype(1, stype)
  492. .set_dtype(2, dtype)
  493. .execl({A, B, {}});
  494. }
  495. TEST_F(CUDA, MATRIX_MUL_CUDNN_F32) {
  496. Checker<MatrixMul> checker(handle_cuda());
  497. checker.set_before_exec_callback(
  498. AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
  499. using Param = MatrixMul::Param;
  500. for (size_t m = 8; m <= 64; m += 4) {
  501. for (size_t n = 8; n <= 64; n += 4) {
  502. for (size_t k = 8; k <= 64; k += 4) {
  503. for (unsigned mask = 0; mask < 4; ++mask) {
  504. Param param;
  505. param.transposeA = mask & 1;
  506. param.transposeB = mask & 2;
  507. DType stype = dtype::Float32();
  508. DType dtype = dtype::Float32();
  509. TensorShape A, B;
  510. if (param.transposeA)
  511. A = TensorShape{k, m};
  512. else
  513. A = TensorShape{m, k};
  514. if (param.transposeB)
  515. B = TensorShape{n, k};
  516. else
  517. B = TensorShape{k, n};
  518. checker.set_param(param)
  519. .set_dtype(0, stype)
  520. .set_dtype(1, stype)
  521. .set_dtype(2, dtype)
  522. .execs({A, B, {}});
  523. }
  524. }
  525. }
  526. }
  527. }
  528. TEST_F(CUDA, MATRIX_MUL_CUDNN_F16) {
  529. Checker<MatrixMul> checker(handle_cuda());
  530. checker.set_before_exec_callback(
  531. AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
  532. using Param = MatrixMul::Param;
  533. for (size_t m = 8; m <= 64; m += 4) {
  534. for (size_t n = 8; n <= 64; n += 4) {
  535. for (size_t k = 8; k <= 64; k += 4) {
  536. for (unsigned mask = 0; mask < 4; ++mask) {
  537. Param param;
  538. param.transposeA = mask & 1;
  539. param.transposeB = mask & 2;
  540. DType stype = dtype::Float16();
  541. DType dtype = dtype::Float16();
  542. TensorShape A, B;
  543. if (param.transposeA)
  544. A = TensorShape{k, m};
  545. else
  546. A = TensorShape{m, k};
  547. if (param.transposeB)
  548. B = TensorShape{n, k};
  549. else
  550. B = TensorShape{k, n};
  551. checker.set_param(param)
  552. .set_dtype(0, stype)
  553. .set_dtype(1, stype)
  554. .set_dtype(2, dtype)
  555. .set_epsilon(6e-2)
  556. .execs({A, B, {}});
  557. }
  558. }
  559. }
  560. }
  561. }
  562. } // namespace test
  563. } // namespace megdnn
  564. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台