You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. /**
  2. * \file dnn/test/cuda/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/cuda/fixture.h"
  12. #include "test/common/checker.h"
  13. #include "test/common/matrix_mul.h"
  14. #include "test/common/benchmarker.h"
  15. #include "src/cuda/utils.h"
  16. #if defined(cuda_check)
  17. #undef cuda_check
  18. #endif
  19. #include "test/cuda/utils.h"
  20. #include <cuda.h>
  21. namespace megdnn {
  22. namespace test {
  23. #if CUDA_VERSION >= 10000
  24. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
  25. if (cuda::current_device_prop().major > 7 ||
  26. (cuda::current_device_prop().major == 7 &&
  27. cuda::current_device_prop().minor >= 5)) {
  28. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION test as current "
  29. "device support wmma intrinsics\n");
  30. return;
  31. }
  32. Checker<MatrixMul> checker(handle_cuda(), false);
  33. using Param = MatrixMul::Param;
  34. Param param;
  35. param.transposeB = true;
  36. checker.set_param(param);
  37. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  38. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  39. checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
  40. ASSERT_THROW(checker.exec({{256, 256}, {256, 256}, {256, 256}}),
  41. MegDNNError);
  42. }
  43. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
  44. if (cuda::current_device_prop().major < 7 ||
  45. (cuda::current_device_prop().major == 7 &&
  46. cuda::current_device_prop().minor < 5)) {
  47. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
  48. "doesn't support\n");
  49. return;
  50. }
  51. Checker<MatrixMul> checker(handle_cuda(), false);
  52. using Param = MatrixMul::Param;
  53. Param param;
  54. param.transposeB = true;
  55. checker.set_param(param);
  56. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  57. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  58. checker.set_dtype(2, dtype::QuantizedS32(1.3f*1.3f));
  59. checker.exec({{256, 256}, {256, 256}, {256, 256}});
  60. auto args = matrix_mul::get_matmul_args();
  61. for (auto arg : args) {
  62. size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8,
  63. k = DIVUP(arg.k, 32) * 32;
  64. checker.exec({{m, k}, {n, k}, {m, n}});
  65. }
  66. }
  67. #if MEGDNN_WITH_BENCHMARK
  68. TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  69. if (cuda::current_device_prop().major < 7 ||
  70. (cuda::current_device_prop().major == 7 &&
  71. cuda::current_device_prop().minor < 5)) {
  72. printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
  73. "device doesn't support\n");
  74. return;
  75. }
  76. Benchmarker<MatrixMul> bencher(handle_cuda());
  77. using Param = MatrixMul::Param;
  78. Param param;
  79. param.transposeB = true;
  80. bencher.set_param(param);
  81. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  82. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  83. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  84. for (size_t m : {256, 1024, 4096, 10240, 40960}) {
  85. for (size_t n : {256, 1024, 4096}) {
  86. for (size_t k :{512, 1024, 2048}) {
  87. bencher.set_times(400);
  88. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  89. auto gflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  90. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n",
  91. m, k, n, time_in_ms, gflps);
  92. }
  93. }
  94. }
  95. }
  96. TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  97. if (cuda::current_device_prop().major < 7 ||
  98. (cuda::current_device_prop().major == 7 &&
  99. cuda::current_device_prop().minor < 5)) {
  100. printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
  101. "current "
  102. "device doesn't support\n");
  103. return;
  104. }
  105. Benchmarker<MatrixMul> bencher(handle_cuda());
  106. using Param = MatrixMul::Param;
  107. Param param;
  108. param.transposeB = true;
  109. bencher.set_param(param);
  110. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  111. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  112. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  113. bencher.set_times(400);
  114. size_t m = 4096, n = 4096, k = 81920;
  115. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  116. auto tflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  117. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n,
  118. time_in_ms, tflps);
  119. }
  120. #endif
  121. #endif
  122. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES) {
  123. if (!cuda::is_compute_capability_required(6, 1)) {
  124. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  125. return;
  126. }
  127. Checker<MatrixMul> checker(handle_cuda());
  128. using Param = MatrixMul::Param;
  129. Param param;
  130. DType stype = dtype::Int8();
  131. checker.set_param(param)
  132. .set_dtype(0, stype)
  133. .set_dtype(1, stype)
  134. .set_dtype(2, dtype::Int32())
  135. .set_epsilon(5e-3);
  136. size_t m = 1024, n = 1024, k = 1024;
  137. {
  138. TensorLayout A{{m, k}, {2048, 1}, dtype::Int8()},
  139. B{{k, n}, {2048, 1}, dtype::Int8()}, C{{m, n}, dtype::Int32()};
  140. checker.execl({A, B, {}});
  141. }
  142. }
  143. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
  144. if (!cuda::is_compute_capability_required(6, 1)) {
  145. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  146. return;
  147. }
  148. using Param = MatrixMul::Param;
  149. UniformIntRNG rng{-128, 127};
  150. Checker<MatrixMul> checker(handle_cuda());
  151. checker.set_rng(0, &rng).set_rng(1, &rng);
  152. size_t m = 1007, n = 1003, k = 129;
  153. for (unsigned mask = 0; mask < 4; ++mask) {
  154. Param param;
  155. param.transposeA = mask & 1;
  156. param.transposeB = mask & 2;
  157. TensorShape A, B;
  158. if (param.transposeA)
  159. A = TensorShape{k, m};
  160. else
  161. A = TensorShape{m, k};
  162. if (param.transposeB)
  163. B = TensorShape{n, k};
  164. else
  165. B = TensorShape{k, n};
  166. checker.set_param(param)
  167. .set_dtype(0, dtype::Int8())
  168. .set_dtype(1, dtype::Int8())
  169. .set_dtype(2, dtype::Int32())
  170. .set_epsilon(0)
  171. .execs({A, B, {}});
  172. }
  173. }
//! General matmul correctness test on sm_60+: fp32/fp16/bf16 (and int8x8x32
//! on sm_61+) over all four transpose combinations, then the strided shapes
//! from matrix_mul::get_matmul_args().
TEST_F(CUDA, MATRIX_MUL) {
    if (cuda::current_device_prop().major < 6) {
        printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    size_t m = 12, n = 16, k = 20;
    // int8x8x32 (dp4a) needs compute capability 6.1
    bool is_int_available = cuda::is_compute_capability_required(6, 1);
    std::vector<DType> dtype_array;
    dtype_array.push_back(dtype::Float32());
    dtype_array.push_back(dtype::Float16());
    dtype_array.push_back(dtype::BFloat16());
    if (is_int_available)
        dtype_array.push_back(dtype::Int32());
    for (DType dtype : dtype_array) {
        // mask bit 0 -> transpose A, bit 1 -> transpose B
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            // Int32 output implies Int8 inputs; otherwise in == out dtype
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            if (dtype == dtype::BFloat16()) {
                // bf16 path: accumulate in fp32 and pin the algo to
                // MATMUL_BFLOAT16 wrapping CUBLAS so that path is exercised
                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
                checker.set_before_exec_callback(
                        AlgoChecker<MatrixMulForward>(ExecutionPolicyAlgoName{
                                "MATMUL_BFLOAT16", {{"CUBLAS", {}}}}));
            }
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    // half-precision dtypes get a looser tolerance
                    .set_epsilon(dtype == dtype::Float16() ||
                                                 dtype == dtype::BFloat16()
                                         ? 5e-2
                                         : 5e-3)
                    .execs({A, B, {}});
            if (dtype == dtype::BFloat16()) {
                // undo the bf16 algo pinning so later dtypes pick freely
                checker.reset_before_exec_callback();
                checker.opr()->execution_policy() = {};
            }
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        // build float32 layouts, honouring an explicit row stride when the
        // arg carries one (UNSET_STRIDE_VAL means "contiguous")
        TensorLayout AL, BL, CL;
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                              dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                              dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                              dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
//! Correctness test pinned to the cuBLASLt backend ("CUBLAS_LT" algo):
//! one int8x8x32 case, fp32/fp16 over all transpose combinations, then the
//! strided shapes from matrix_mul::get_matmul_args(). Requires sm_75.
TEST_F(CUDA, MATRIX_MUL_CUBLASLT)
{
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(
                    AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    size_t m = 32, n = 32, k = 32;
    // test Int8 matmul (no transpose)
    {
        DType dtype = dtype::Int32();
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        // Int32 output implies Int8 inputs
        DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, stype)
                .set_dtype(1, stype)
                .set_dtype(2, dtype)
                .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                .execs({A, B, {}});
    }
    // test floating-point matmul
    for (DType dtype : std::array<DType, 2>{
                 {dtype::Float32(), dtype::Float16()}}) {
        // mask bit 0 -> transpose A, bit 1 -> transpose B
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    // fp16 gets a looser tolerance
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 8e-3)
                    .execs({A, B, {}});
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        // build float32 layouts, honouring an explicit row stride when the
        // arg carries one (UNSET_STRIDE_VAL means "contiguous")
        TensorLayout AL, BL, CL;
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                              dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                              dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                              dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
  358. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_SPECIAL_CASE) {
  359. require_compute_capability(7, 5);
  360. size_t m = 12, n = 16, k = 20;
  361. Checker<MatrixMul> checker(handle_cuda());
  362. checker.set_before_exec_callback(
  363. AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  364. using Param = MatrixMul::Param;
  365. Param param;
  366. DType stype = dtype::Float32();
  367. DType dtype = dtype::Float32();
  368. TensorShape A, B;
  369. param.transposeA=param.transposeB=1;
  370. if (param.transposeA)
  371. A = TensorShape{k, m};
  372. else
  373. A = TensorShape{m, k};
  374. if (param.transposeB)
  375. B = TensorShape{n, k};
  376. else
  377. B = TensorShape{k, n};
  378. checker.set_param(param).
  379. set_dtype(0, stype).
  380. set_dtype(1, stype).
  381. set_dtype(2, dtype).
  382. set_epsilon(dtype == dtype::Float16() ? 5e-1 : 5e-2).
  383. execs({A, B, {}});
  384. }
  385. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
  386. require_compute_capability(7, 5);
  387. NormalRNG normal_rng;
  388. Checker<MatrixMul> checker(handle_cuda());
  389. checker.set_rng(0, &normal_rng)
  390. .set_rng(1, &normal_rng)
  391. .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  392. using Param = MatrixMul::Param;
  393. //size_t m = 32, n = 32, k = 32;
  394. // test Int8 matmul
  395. for (size_t m=8; m<=64; m+=4)
  396. for (size_t n=8; n<=64; n+=4)
  397. for (size_t k=8; k<=64; k+=4)
  398. {
  399. DType dtype=dtype::Int32();
  400. Param param;
  401. param.transposeA = false;
  402. param.transposeB = false;
  403. DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
  404. TensorShape A, B;
  405. A = TensorShape{m, k};
  406. B = TensorShape{k, n};
  407. checker.set_param(param).
  408. set_dtype(0, stype).
  409. set_dtype(1, stype).
  410. set_dtype(2, dtype).
  411. set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3).
  412. execs({A, B, {}});
  413. }
  414. }
  415. } // namespace test
  416. } // namespace megdnn
  417. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台