
matrix_mul.cpp

#include "test/cuda/fixture.h"

#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/cuda/utils.h"

#if defined(cuda_check)
#undef cuda_check
#endif
#include "src/cuda/utils.h"

namespace megdnn {
namespace test {

#if CUDA_VERSION >= 10000
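// The test below targets devices without wmma int4 support (compute capability
// below 7.5); on such devices the quantized 4x4x32 matmul is expected to fail,
// so the checker asserts that executing it throws MegDNNError.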
TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
    if (cuda::current_device_prop().major > 7 ||
        (cuda::current_device_prop().major == 7 &&
         cuda::current_device_prop().minor >= 5)) {
        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION test as current "
               "device support wmma intrinsics\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda(), false);
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    checker.set_param(param);
    checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
    ASSERT_THROW(checker.exec({{256, 256}, {256, 256}, {256, 256}}), MegDNNError);
}
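// On devices with compute capability >= 7.5 the same quantized 4-bit matmul
// should succeed. Sizes from the common argument list are rounded up so that
// m and n are multiples of 8 and k is a multiple of 32, presumably to match
// the wmma int4 fragment shape.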
TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
    require_compute_capability(7, 5);
    Checker<MatrixMul> checker(handle_cuda(), false);
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    checker.set_param(param);
    checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
    checker.exec({{256, 256}, {256, 256}, {256, 256}});
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        size_t m = (arg.m + 7) / 8 * 8, n = (arg.n + 7) / 8 * 8,
               k = (arg.k + 31) / 32 * 32;
        checker.exec({{m, k}, {n, k}, {m, n}});
    }
}
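// The benchmarks below report throughput in TFlops, counting 2*m*n*k
// operations per matmul and averaging the runtime over 400 executions.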
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
    require_compute_capability(7, 5);
    Benchmarker<MatrixMul> bencher(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    bencher.set_param(param);
    bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
    for (size_t m : {256, 1024, 4096, 10240, 40960}) {
        for (size_t n : {256, 1024, 4096}) {
            for (size_t k : {512, 1024, 2048}) {
                bencher.set_times(400);
                auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
                auto gflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
                printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n,
                       time_in_ms, gflps);
            }
        }
    }
}
TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
    require_compute_capability(7, 5);
    Benchmarker<MatrixMul> bencher(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    bencher.set_param(param);
    bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
    bencher.set_times(400);
    size_t m = 4096, n = 4096, k = 81920;
    auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
    auto tflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
    printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n, time_in_ms,
           tflps);
}
#endif
#endif
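// Non-contiguous int8 case: the row stride (2048) is larger than the number of
// columns (1024), i.e. each matrix is embedded in a wider buffer.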
TEST_F(CUDA, MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES) {
    require_compute_capability(6, 1);
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Int8();
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype::Int32())
            .set_epsilon(5e-3);
    size_t m = 1024, n = 1024, k = 1024;
    {
        TensorLayout A{{m, k}, {2048, 1}, dtype::Int8()},
                B{{k, n}, {2048, 1}, dtype::Int8()}, C{{m, n}, dtype::Int32()};
        checker.execl({A, B, {}});
    }
}
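// Naive int8x8x32 kernel: all four transpose combinations on deliberately odd
// sizes, with epsilon 0 so the integer result must match the reference exactly.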
TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
    require_compute_capability(6, 1);
    using Param = MatrixMul::Param;
    UniformIntRNG rng{-128, 127};
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &rng).set_rng(1, &rng);
    size_t m = 1007, n = 1003, k = 129;
    for (unsigned mask = 0; mask < 4; ++mask) {
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape A, B;
        if (param.transposeA)
            A = TensorShape{k, m};
        else
            A = TensorShape{m, k};
        if (param.transposeB)
            B = TensorShape{n, k};
        else
            B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_epsilon(0)
                .execs({A, B, {}});
    }
}
TEST_F(CUDA, MATRIX_MUL_FLOAT_NAIVE) {
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("NAIVE"));
    using Param = MatrixMul::Param;
    size_t m = 12, n = 16, k = 20;
    std::vector<DType> dtype_array;
    dtype_array.push_back(dtype::Float32());
    dtype_array.push_back(dtype::Float16());
    for (DType dtype : dtype_array) {
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            if (dtype == dtype::Float16()) {
                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
            }
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                    .execs({A, B, {}});
        }
    }
}
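// General correctness test: float32/float16/bfloat16 (plus int8x8x32 on devices
// with compute capability >= 6.1), all transpose combinations, followed by the
// strided cases from matrix_mul::get_matmul_args().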
TEST_F(CUDA, MATRIX_MUL) {
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    size_t m = 12, n = 16, k = 20;
    bool is_int_available = check_compute_capability(6, 1);
    std::vector<DType> dtype_array;
    dtype_array.push_back(dtype::Float32());
    dtype_array.push_back(dtype::Float16());
    dtype_array.push_back(dtype::BFloat16());
    if (is_int_available)
        dtype_array.push_back(dtype::Int32());
    for (DType dtype : dtype_array) {
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            if (dtype == dtype::BFloat16()) {
                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
                checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>(
                        ExecutionPolicyAlgoName{"MATMUL_BFLOAT16", {{"CUBLAS", {}}}}));
            }
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(
                            dtype == dtype::Float16() || dtype == dtype::BFloat16()
                                    ? 5e-2
                                    : 5e-3)
                    .execs({A, B, {}});
            if (dtype == dtype::BFloat16()) {
                checker.reset_before_exec_callback();
                checker.opr()->execution_policy() = {};
            }
        }
    }

    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1}, dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1}, dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1}, dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
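// Same style of coverage as above, but forcing the CUBLAS_LT algorithm through
// the before-exec callback.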
TEST_F(CUDA, MATRIX_MUL_CUBLASLT) {
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    size_t m = 32, n = 32, k = 32;
    // test Int8 matmul
    {
        DType dtype = dtype::Int32();
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, stype)
                .set_dtype(1, stype)
                .set_dtype(2, dtype)
                .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                .execs({A, B, {}});
    }
    // test floating-point matmul
    for (DType dtype : std::array<DType, 2>{{dtype::Float32(), dtype::Float16()}}) {
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 8e-3)
                    .execs({A, B, {}});
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1}, dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1}, dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1}, dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
TEST_F(CUDA, MATRIX_MUL_CUBLASLT_SPECIAL_CASE) {
    require_compute_capability(7, 5);
    size_t m = 12, n = 16, k = 20;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Float32();
    DType dtype = dtype::Float32();
    TensorShape A, B;
    param.transposeA = param.transposeB = 1;
    if (param.transposeA)
        A = TensorShape{k, m};
    else
        A = TensorShape{m, k};
    if (param.transposeB)
        B = TensorShape{n, k};
    else
        B = TensorShape{k, n};
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype)
            .set_epsilon(dtype == dtype::Float16() ? 5e-1 : 5e-2)
            .execs({A, B, {}});
}
TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    // size_t m = 32, n = 32, k = 32;
    // test Int8 matmul
    for (size_t m = 8; m <= 64; m += 4)
        for (size_t n = 8; n <= 64; n += 4)
            for (size_t k = 8; k <= 64; k += 4) {
                DType dtype = dtype::Int32();
                Param param;
                param.transposeA = false;
                param.transposeB = false;
                DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
                TensorShape A, B;
                A = TensorShape{m, k};
                B = TensorShape{k, n};
                checker.set_param(param)
                        .set_dtype(0, stype)
                        .set_dtype(1, stype)
                        .set_dtype(2, dtype)
                        .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                        .execs({A, B, {}});
            }
}
TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
    require_compute_capability(7, 5);
    size_t m = 128, n = 1024, k = 18432;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Float32();
    DType dtype = dtype::Float32();
    TensorShape A, B;
    param.transposeA = param.transposeB = 0;
    if (param.transposeA)
        A = TensorShape{k, m};
    else
        A = TensorShape{m, k};
    if (param.transposeB)
        B = TensorShape{n, k};
    else
        B = TensorShape{k, n};
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype)
            .execs({A, B, {}});
}
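// Non-contiguous float32 case for the MATMUL_CONV1X1 algorithm: row stride 128
// with only 100 columns, so the inputs are not densely packed.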
TEST_F(CUDA, MATRIX_MUL_CUDNN_F32_uncont) {
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
    using Param = MatrixMul::Param;
    size_t m = 100, n = 100, k = 100;
    Param param;
    param.transposeA = 1;
    param.transposeB = 1;
    TensorLayout A{{m, k}, {128, 1}, dtype::Float32()},
            B{{k, n}, {128, 1}, dtype::Float32()}, C{{m, n}, dtype::Float32()};
    DType stype = dtype::Float32();
    DType dtype = dtype::Float32();
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype)
            .execl({A, B, {}});
}
TEST_F(CUDA, MATRIX_MUL_CUDNN_F32) {
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
    using Param = MatrixMul::Param;
    for (size_t m = 8; m <= 64; m += 4) {
        for (size_t n = 8; n <= 64; n += 4) {
            for (size_t k = 8; k <= 64; k += 4) {
                for (unsigned mask = 0; mask < 4; ++mask) {
                    Param param;
                    param.transposeA = mask & 1;
                    param.transposeB = mask & 2;
                    DType stype = dtype::Float32();
                    DType dtype = dtype::Float32();
                    TensorShape A, B;
                    if (param.transposeA)
                        A = TensorShape{k, m};
                    else
                        A = TensorShape{m, k};
                    if (param.transposeB)
                        B = TensorShape{n, k};
                    else
                        B = TensorShape{k, n};
                    checker.set_param(param)
                            .set_dtype(0, stype)
                            .set_dtype(1, stype)
                            .set_dtype(2, dtype)
                            .execs({A, B, {}});
                }
            }
        }
    }
}
TEST_F(CUDA, MATRIX_MUL_CUDNN_F16) {
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("MATMUL_CONV1X1"));
    using Param = MatrixMul::Param;
    for (size_t m = 8; m <= 64; m += 4) {
        for (size_t n = 8; n <= 64; n += 4) {
            for (size_t k = 8; k <= 64; k += 4) {
                for (unsigned mask = 0; mask < 4; ++mask) {
                    Param param;
                    param.transposeA = mask & 1;
                    param.transposeB = mask & 2;
                    DType stype = dtype::Float16();
                    DType dtype = dtype::Float16();
                    TensorShape A, B;
                    if (param.transposeA)
                        A = TensorShape{k, m};
                    else
                        A = TensorShape{m, k};
                    if (param.transposeB)
                        B = TensorShape{n, k};
                    else
                        B = TensorShape{k, n};
                    checker.set_param(param)
                            .set_dtype(0, stype)
                            .set_dtype(1, stype)
                            .set_dtype(2, dtype)
                            .set_epsilon(6e-2)
                            .execs({A, B, {}});
                }
            }
        }
    }
}

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen