You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 21 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577
  1. /**
  2. * \file dnn/test/cuda/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "test/cuda/fixture.h"
  13. #include "test/common/benchmarker.h"
  14. #include "test/common/checker.h"
  15. #include "test/common/matrix_mul.h"
  16. #include "src/cuda/utils.h"
  17. #if defined(cuda_check)
  18. #undef cuda_check
  19. #endif
  20. #include "test/cuda/utils.h"
  21. #include <cuda.h>
  22. namespace megdnn {
  23. namespace test {
  24. #if CUDA_VERSION >= 10000
  25. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
  26. if (cuda::current_device_prop().major > 7 ||
  27. (cuda::current_device_prop().major == 7 &&
  28. cuda::current_device_prop().minor >= 5)) {
  29. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION test as current "
  30. "device support wmma intrinsics\n");
  31. return;
  32. }
  33. Checker<MatrixMul> checker(handle_cuda(), false);
  34. using Param = MatrixMul::Param;
  35. Param param;
  36. param.transposeB = true;
  37. checker.set_param(param);
  38. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  39. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  40. checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
  41. ASSERT_THROW(checker.exec({{256, 256}, {256, 256}, {256, 256}}), MegDNNError);
  42. }
  43. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
  44. if (cuda::current_device_prop().major < 7 ||
  45. (cuda::current_device_prop().major == 7 &&
  46. cuda::current_device_prop().minor < 5)) {
  47. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
  48. "doesn't support\n");
  49. return;
  50. }
  51. Checker<MatrixMul> checker(handle_cuda(), false);
  52. using Param = MatrixMul::Param;
  53. Param param;
  54. param.transposeB = true;
  55. checker.set_param(param);
  56. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  57. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  58. checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
  59. checker.exec({{256, 256}, {256, 256}, {256, 256}});
  60. auto args = matrix_mul::get_matmul_args();
  61. for (auto arg : args) {
  62. size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8,
  63. k = DIVUP(arg.k, 32) * 32;
  64. checker.exec({{m, k}, {n, k}, {m, n}});
  65. }
  66. }
  67. #if MEGDNN_WITH_BENCHMARK
  68. TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  69. if (cuda::current_device_prop().major < 7 ||
  70. (cuda::current_device_prop().major == 7 &&
  71. cuda::current_device_prop().minor < 5)) {
  72. printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
  73. "device doesn't support\n");
  74. return;
  75. }
  76. Benchmarker<MatrixMul> bencher(handle_cuda());
  77. using Param = MatrixMul::Param;
  78. Param param;
  79. param.transposeB = true;
  80. bencher.set_param(param);
  81. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  82. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  83. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  84. for (size_t m : {256, 1024, 4096, 10240, 40960}) {
  85. for (size_t n : {256, 1024, 4096}) {
  86. for (size_t k : {512, 1024, 2048}) {
  87. bencher.set_times(400);
  88. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  89. auto gflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  90. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n,
  91. time_in_ms, gflps);
  92. }
  93. }
  94. }
  95. }
  96. TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  97. if (cuda::current_device_prop().major < 7 ||
  98. (cuda::current_device_prop().major == 7 &&
  99. cuda::current_device_prop().minor < 5)) {
  100. printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
  101. "current "
  102. "device doesn't support\n");
  103. return;
  104. }
  105. Benchmarker<MatrixMul> bencher(handle_cuda());
  106. using Param = MatrixMul::Param;
  107. Param param;
  108. param.transposeB = true;
  109. bencher.set_param(param);
  110. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  111. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  112. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  113. bencher.set_times(400);
  114. size_t m = 4096, n = 4096, k = 81920;
  115. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  116. auto tflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  117. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n, time_in_ms,
  118. tflps);
  119. }
  120. #endif
  121. #endif
  122. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES) {
  123. if (!cuda::is_compute_capability_required(6, 1)) {
  124. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  125. return;
  126. }
  127. Checker<MatrixMul> checker(handle_cuda());
  128. using Param = MatrixMul::Param;
  129. Param param;
  130. DType stype = dtype::Int8();
  131. checker.set_param(param)
  132. .set_dtype(0, stype)
  133. .set_dtype(1, stype)
  134. .set_dtype(2, dtype::Int32())
  135. .set_epsilon(5e-3);
  136. size_t m = 1024, n = 1024, k = 1024;
  137. {
  138. TensorLayout A{{m, k}, {2048, 1}, dtype::Int8()},
  139. B{{k, n}, {2048, 1}, dtype::Int8()}, C{{m, n}, dtype::Int32()};
  140. checker.execl({A, B, {}});
  141. }
  142. }
  143. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
  144. if (!cuda::is_compute_capability_required(6, 1)) {
  145. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  146. return;
  147. }
  148. using Param = MatrixMul::Param;
  149. UniformIntRNG rng{-128, 127};
  150. Checker<MatrixMul> checker(handle_cuda());
  151. checker.set_rng(0, &rng).set_rng(1, &rng);
  152. size_t m = 1007, n = 1003, k = 129;
  153. for (unsigned mask = 0; mask < 4; ++mask) {
  154. Param param;
  155. param.transposeA = mask & 1;
  156. param.transposeB = mask & 2;
  157. TensorShape A, B;
  158. if (param.transposeA)
  159. A = TensorShape{k, m};
  160. else
  161. A = TensorShape{m, k};
  162. if (param.transposeB)
  163. B = TensorShape{n, k};
  164. else
  165. B = TensorShape{k, n};
  166. checker.set_param(param)
  167. .set_dtype(0, dtype::Int8())
  168. .set_dtype(1, dtype::Int8())
  169. .set_dtype(2, dtype::Int32())
  170. .set_epsilon(0)
  171. .execs({A, B, {}});
  172. }
  173. }
  174. TEST_F(CUDA, MATRIX_MUL_FLOAT_NAIVE) {
  175. Checker<MatrixMul> checker(handle_cuda());
  176. checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("NAIVE"));
  177. using Param = MatrixMul::Param;
  178. size_t m = 12, n = 16, k = 20;
  179. std::vector<DType> dtype_array;
  180. dtype_array.push_back(dtype::Float32());
  181. dtype_array.push_back(dtype::Float16());
  182. for (DType dtype : dtype_array) {
  183. for (unsigned mask = 0; mask < 4; ++mask) {
  184. Param param;
  185. param.transposeA = mask & 1;
  186. param.transposeB = mask & 2;
  187. DType stype = dtype;
  188. TensorShape A, B;
  189. if (param.transposeA)
  190. A = TensorShape{k, m};
  191. else
  192. A = TensorShape{m, k};
  193. if (param.transposeB)
  194. B = TensorShape{n, k};
  195. else
  196. B = TensorShape{k, n};
  197. if (dtype == dtype::Float16()) {
  198. param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
  199. }
  200. checker.set_param(param)
  201. .set_dtype(0, stype)
  202. .set_dtype(1, stype)
  203. .set_dtype(2, dtype)
  204. .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
  205. .execs({A, B, {}});
  206. }
  207. }
  208. }
TEST_F(CUDA, MATRIX_MUL) {
    // General matmul coverage: f32/f16/bf16 (plus int8x8x32 where available)
    // across all transpose combinations, followed by the shared stride-aware
    // arg list. Requires compute capability >= 6.0.
    if (cuda::current_device_prop().major < 6) {
        printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    size_t m = 12, n = 16, k = 20;
    // int8 inputs / int32 output additionally need compute capability >= 6.1.
    bool is_int_available = cuda::is_compute_capability_required(6, 1);
    std::vector<DType> dtype_array;
    dtype_array.push_back(dtype::Float32());
    dtype_array.push_back(dtype::Float16());
    dtype_array.push_back(dtype::BFloat16());
    if (is_int_available)
        dtype_array.push_back(dtype::Int32());
    for (DType dtype : dtype_array) {
        // bit 0 of mask -> transposeA, bit 1 -> transposeB.
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            // Int32 output implies Int8 inputs; otherwise in/out dtypes match.
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            if (dtype == dtype::BFloat16()) {
                // BFloat16 is forced through the MATMUL_BFLOAT16 wrapper algo
                // (delegating to CUBLAS) with FLOAT32 compute mode.
                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
                checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>(
                        ExecutionPolicyAlgoName{"MATMUL_BFLOAT16", {{"CUBLAS", {}}}}));
            }
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(
                            dtype == dtype::Float16() || dtype == dtype::BFloat16()
                                    ? 5e-2
                                    : 5e-3)
                    .execs({A, B, {}});
            if (dtype == dtype::BFloat16()) {
                // Undo the bfloat16-specific algo pinning so subsequent dtypes
                // use the default algorithm selection again.
                checker.reset_before_exec_callback();
                checker.opr()->execution_policy() = {};
            }
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        // Build layouts honoring any explicit row stride from the arg;
        // UNSET_STRIDE_VAL means contiguous.
        TensorLayout AL, BL, CL;
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1}, dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1}, dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1}, dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
TEST_F(CUDA, MATRIX_MUL_CUBLASLT) {
    // Exercise the CUBLAS_LT algorithm in three phases: one int8 case, a
    // float32/float16 transpose sweep, and the shared stride-aware arg list.
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    size_t m = 32, n = 32, k = 32;
    // test Int8 matmul
    {
        DType dtype = dtype::Int32();
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        // Int32 output implies Int8 inputs.
        DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, stype)
                .set_dtype(1, stype)
                .set_dtype(2, dtype)
                .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                .execs({A, B, {}});
    }
    // test float-point matmul
    for (DType dtype : std::array<DType, 2>{{dtype::Float32(), dtype::Float16()}}) {
        // bit 0 of mask -> transposeA, bit 1 -> transposeB.
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 8e-3)
                    .execs({A, B, {}});
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        // Build layouts honoring any explicit row stride from the arg;
        // UNSET_STRIDE_VAL means contiguous.
        TensorLayout AL, BL, CL;
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1}, dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1}, dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1}, dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
  383. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_SPECIAL_CASE) {
  384. require_compute_capability(7, 5);
  385. size_t m = 12, n = 16, k = 20;
  386. Checker<MatrixMul> checker(handle_cuda());
  387. checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  388. using Param = MatrixMul::Param;
  389. Param param;
  390. DType stype = dtype::Float32();
  391. DType dtype = dtype::Float32();
  392. TensorShape A, B;
  393. param.transposeA = param.transposeB = 1;
  394. if (param.transposeA)
  395. A = TensorShape{k, m};
  396. else
  397. A = TensorShape{m, k};
  398. if (param.transposeB)
  399. B = TensorShape{n, k};
  400. else
  401. B = TensorShape{k, n};
  402. checker.set_param(param)
  403. .set_dtype(0, stype)
  404. .set_dtype(1, stype)
  405. .set_dtype(2, dtype)
  406. .set_epsilon(dtype == dtype::Float16() ? 5e-1 : 5e-2)
  407. .execs({A, B, {}});
  408. }
  409. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
  410. require_compute_capability(7, 5);
  411. NormalRNG normal_rng;
  412. Checker<MatrixMul> checker(handle_cuda());
  413. checker.set_rng(0, &normal_rng)
  414. .set_rng(1, &normal_rng)
  415. .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  416. using Param = MatrixMul::Param;
  417. // size_t m = 32, n = 32, k = 32;
  418. // test Int8 matmul
  419. for (size_t m = 8; m <= 64; m += 4)
  420. for (size_t n = 8; n <= 64; n += 4)
  421. for (size_t k = 8; k <= 64; k += 4) {
  422. DType dtype = dtype::Int32();
  423. Param param;
  424. param.transposeA = false;
  425. param.transposeB = false;
  426. DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
  427. TensorShape A, B;
  428. A = TensorShape{m, k};
  429. B = TensorShape{k, n};
  430. checker.set_param(param)
  431. .set_dtype(0, stype)
  432. .set_dtype(1, stype)
  433. .set_dtype(2, dtype)
  434. .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
  435. .execs({A, B, {}});
  436. }
  437. }
  438. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
  439. require_compute_capability(7, 5);
  440. size_t m = 128, n = 1024, k = 18432;
  441. Checker<MatrixMul> checker(handle_cuda());
  442. checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  443. using Param = MatrixMul::Param;
  444. Param param;
  445. DType stype = dtype::Float32();
  446. DType dtype = dtype::Float32();
  447. TensorShape A, B;
  448. param.transposeA = param.transposeB = 0;
  449. if (param.transposeA)
  450. A = TensorShape{k, m};
  451. else
  452. A = TensorShape{m, k};
  453. if (param.transposeB)
  454. B = TensorShape{n, k};
  455. else
  456. B = TensorShape{k, n};
  457. checker.set_param(param)
  458. .set_dtype(0, stype)
  459. .set_dtype(1, stype)
  460. .set_dtype(2, dtype)
  461. .execs({A, B, {}});
  462. }
  463. TEST_F(CUDA, MATRIX_MUL_CUDNN_F32_uncont) {
  464. Checker<MatrixMul> checker(handle_cuda());
  465. checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
  466. using Param = MatrixMul::Param;
  467. size_t m = 100, n = 100, k = 100;
  468. Param param;
  469. param.transposeA = 1;
  470. param.transposeB = 1;
  471. TensorLayout A{{m, k}, {128, 1}, dtype::Float32()},
  472. B{{k, n}, {128, 1}, dtype::Float32()}, C{{m, n}, dtype::Float32()};
  473. DType stype = dtype::Float32();
  474. DType dtype = dtype::Float32();
  475. checker.set_param(param)
  476. .set_dtype(0, stype)
  477. .set_dtype(1, stype)
  478. .set_dtype(2, dtype)
  479. .execl({A, B, {}});
  480. }
  481. TEST_F(CUDA, MATRIX_MUL_CUDNN_F32) {
  482. Checker<MatrixMul> checker(handle_cuda());
  483. checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
  484. using Param = MatrixMul::Param;
  485. for (size_t m = 8; m <= 64; m += 4) {
  486. for (size_t n = 8; n <= 64; n += 4) {
  487. for (size_t k = 8; k <= 64; k += 4) {
  488. for (unsigned mask = 0; mask < 4; ++mask) {
  489. Param param;
  490. param.transposeA = mask & 1;
  491. param.transposeB = mask & 2;
  492. DType stype = dtype::Float32();
  493. DType dtype = dtype::Float32();
  494. TensorShape A, B;
  495. if (param.transposeA)
  496. A = TensorShape{k, m};
  497. else
  498. A = TensorShape{m, k};
  499. if (param.transposeB)
  500. B = TensorShape{n, k};
  501. else
  502. B = TensorShape{k, n};
  503. checker.set_param(param)
  504. .set_dtype(0, stype)
  505. .set_dtype(1, stype)
  506. .set_dtype(2, dtype)
  507. .execs({A, B, {}});
  508. }
  509. }
  510. }
  511. }
  512. }
  513. TEST_F(CUDA, MATRIX_MUL_CUDNN_F16) {
  514. Checker<MatrixMul> checker(handle_cuda());
  515. checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
  516. using Param = MatrixMul::Param;
  517. for (size_t m = 8; m <= 64; m += 4) {
  518. for (size_t n = 8; n <= 64; n += 4) {
  519. for (size_t k = 8; k <= 64; k += 4) {
  520. for (unsigned mask = 0; mask < 4; ++mask) {
  521. Param param;
  522. param.transposeA = mask & 1;
  523. param.transposeB = mask & 2;
  524. DType stype = dtype::Float16();
  525. DType dtype = dtype::Float16();
  526. TensorShape A, B;
  527. if (param.transposeA)
  528. A = TensorShape{k, m};
  529. else
  530. A = TensorShape{m, k};
  531. if (param.transposeB)
  532. B = TensorShape{n, k};
  533. else
  534. B = TensorShape{k, n};
  535. checker.set_param(param)
  536. .set_dtype(0, stype)
  537. .set_dtype(1, stype)
  538. .set_dtype(2, dtype)
  539. .set_epsilon(6e-2)
  540. .execs({A, B, {}});
  541. }
  542. }
  543. }
  544. }
  545. }
  546. } // namespace test
  547. } // namespace megdnn
  548. // vim: syntax=cpp.doxygen