You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

matrix_mul.cpp 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430
/**
 * \file dnn/test/cuda/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
  11. #include "test/cuda/fixture.h"
  12. #include "test/common/checker.h"
  13. #include "test/common/matrix_mul.h"
  14. #include "test/common/benchmarker.h"
  15. #include "src/cuda/utils.h"
  16. #if defined(cuda_check)
  17. #undef cuda_check
  18. #endif
  19. #include "test/cuda/utils.h"
  20. #include <cuda.h>
  21. namespace megdnn {
  22. namespace test {
  23. #if CUDA_VERSION >= 10000
  24. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
  25. if (cuda::current_device_prop().major > 7 ||
  26. (cuda::current_device_prop().major == 7 &&
  27. cuda::current_device_prop().minor >= 5)) {
  28. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION test as current "
  29. "device support wmma intrinsics\n");
  30. return;
  31. }
  32. Checker<MatrixMul> checker(handle_cuda(), false);
  33. using Param = MatrixMul::Param;
  34. Param param;
  35. param.transposeB = true;
  36. checker.set_param(param);
  37. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  38. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  39. checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
  40. ASSERT_THROW(checker.exec({{256, 256}, {256, 256}, {256, 256}}),
  41. MegDNNError);
  42. }
  43. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
  44. if (cuda::current_device_prop().major < 7 ||
  45. (cuda::current_device_prop().major == 7 &&
  46. cuda::current_device_prop().minor < 5)) {
  47. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device doesn't support\n");
  48. return;
  49. }
  50. Checker<MatrixMul> checker(handle_cuda(), false);
  51. using Param = MatrixMul::Param;
  52. Param param;
  53. param.transposeB = true;
  54. checker.set_param(param);
  55. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  56. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  57. checker.set_dtype(2, dtype::QuantizedS32(1.3f*1.3f));
  58. checker.exec({{256, 256}, {256, 256}, {256, 256}});
  59. auto args = matrix_mul::get_matmul_args();
  60. for (auto arg : args) {
  61. size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8,
  62. k = DIVUP(arg.k, 32) * 32;
  63. checker.exec({{m, k}, {n, k}, {m, n}});
  64. }
  65. }
  66. #if MEGDNN_WITH_BENCHMARK
  67. TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  68. if (cuda::current_device_prop().major < 7 ||
  69. (cuda::current_device_prop().major == 7 &&
  70. cuda::current_device_prop().minor < 5)) {
  71. printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
  72. "device doesn't support\n");
  73. return;
  74. }
  75. Benchmarker<MatrixMul> bencher(handle_cuda());
  76. using Param = MatrixMul::Param;
  77. Param param;
  78. param.transposeB = true;
  79. bencher.set_param(param);
  80. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  81. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  82. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  83. for (size_t m : {256, 1024, 4096, 10240, 40960}) {
  84. for (size_t n : {256, 1024, 4096}) {
  85. for (size_t k :{512, 1024, 2048}) {
  86. bencher.set_times(400);
  87. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  88. auto gflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  89. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n",
  90. m, k, n, time_in_ms, gflps);
  91. }
  92. }
  93. }
  94. }
  95. TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  96. if (cuda::current_device_prop().major < 7 ||
  97. (cuda::current_device_prop().major == 7 &&
  98. cuda::current_device_prop().minor < 5)) {
  99. printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
  100. "current "
  101. "device doesn't support\n");
  102. return;
  103. }
  104. Benchmarker<MatrixMul> bencher(handle_cuda());
  105. using Param = MatrixMul::Param;
  106. Param param;
  107. param.transposeB = true;
  108. bencher.set_param(param);
  109. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  110. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  111. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  112. bencher.set_times(400);
  113. size_t m = 4096, n = 4096, k = 81920;
  114. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  115. auto tflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  116. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n,
  117. time_in_ms, tflps);
  118. }
  119. #endif
  120. #endif
  121. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES) {
  122. if (!cuda::is_compute_capability_required(6, 1)) {
  123. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  124. return;
  125. }
  126. Checker<MatrixMul> checker(handle_cuda());
  127. using Param = MatrixMul::Param;
  128. Param param;
  129. DType stype = dtype::Int8();
  130. checker.set_param(param)
  131. .set_dtype(0, stype)
  132. .set_dtype(1, stype)
  133. .set_dtype(2, dtype::Int32())
  134. .set_epsilon(5e-3);
  135. size_t m = 1024, n = 1024, k = 1024;
  136. {
  137. TensorLayout A{{m, k}, {2048, 1}, dtype::Int8()},
  138. B{{k, n}, {2048, 1}, dtype::Int8()}, C{{m, n}, dtype::Int32()};
  139. checker.execl({A, B, {}});
  140. }
  141. }
  142. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
  143. if (!cuda::is_compute_capability_required(6, 1)) {
  144. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  145. return;
  146. }
  147. using Param = MatrixMul::Param;
  148. UniformIntRNG rng{-128, 127};
  149. Checker<MatrixMul> checker(handle_cuda());
  150. checker.set_rng(0, &rng).set_rng(1, &rng);
  151. size_t m = 1007, n = 1003, k = 129;
  152. for (unsigned mask = 0; mask < 4; ++mask) {
  153. Param param;
  154. param.transposeA = mask & 1;
  155. param.transposeB = mask & 2;
  156. TensorShape A, B;
  157. if (param.transposeA)
  158. A = TensorShape{k, m};
  159. else
  160. A = TensorShape{m, k};
  161. if (param.transposeB)
  162. B = TensorShape{n, k};
  163. else
  164. B = TensorShape{k, n};
  165. checker.set_param(param)
  166. .set_dtype(0, dtype::Int8())
  167. .set_dtype(1, dtype::Int8())
  168. .set_dtype(2, dtype::Int32())
  169. .set_epsilon(0)
  170. .execs({A, B, {}});
  171. }
  172. }
  173. TEST_F(CUDA, MATRIX_MUL) {
  174. if (cuda::current_device_prop().major < 6) {
  175. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  176. return;
  177. }
  178. Checker<MatrixMul> checker(handle_cuda());
  179. using Param = MatrixMul::Param;
  180. size_t m = 12, n = 16, k = 20;
  181. bool is_int_available = cuda::is_compute_capability_required(6, 1);
  182. std::vector<DType> dtype_array;
  183. dtype_array.push_back(dtype::Float32());
  184. dtype_array.push_back(dtype::Float16());
  185. dtype_array.push_back(dtype::BFloat16());
  186. if (is_int_available)
  187. dtype_array.push_back(dtype::Int32());
  188. for (DType dtype : dtype_array) {
  189. for (unsigned mask = 0; mask < 4; ++mask) {
  190. Param param;
  191. param.transposeA = mask & 1;
  192. param.transposeB = mask & 2;
  193. DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
  194. TensorShape A, B;
  195. if (param.transposeA)
  196. A = TensorShape{k, m};
  197. else
  198. A = TensorShape{m, k};
  199. if (param.transposeB)
  200. B = TensorShape{n, k};
  201. else
  202. B = TensorShape{k, n};
  203. if (dtype == dtype::BFloat16()) {
  204. param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
  205. }
  206. checker.set_param(param)
  207. .set_dtype(0, stype)
  208. .set_dtype(1, stype)
  209. .set_dtype(2, dtype)
  210. .set_epsilon(dtype == dtype::Float16() ||
  211. dtype == dtype::BFloat16()
  212. ? 5e-2
  213. : 5e-3)
  214. .execs({A, B, {}});
  215. }
  216. }
  217. // general tests
  218. auto args = matrix_mul::get_matmul_args();
  219. for (auto arg: args) {
  220. auto m = arg.m, n = arg.n, k = arg.k;
  221. auto mask = arg.mask;
  222. Param param;
  223. param.transposeA = mask & 1;
  224. param.transposeB = mask & 2;
  225. TensorShape AS, BS, CS;
  226. if (param.transposeA)
  227. AS = TensorShape{k, m};
  228. else
  229. AS = TensorShape{m, k};
  230. if (param.transposeB)
  231. BS = TensorShape{n, k};
  232. else
  233. BS = TensorShape{k, n};
  234. CS = TensorShape{m, n};
  235. TensorLayout AL, BL, CL;
  236. if (arg.A_stride == 0) {
  237. AL = TensorLayout(AS, dtype::Float32());
  238. } else {
  239. AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
  240. dtype::Float32());
  241. }
  242. if (arg.B_stride == 0) {
  243. BL = TensorLayout(BS, dtype::Float32());
  244. } else {
  245. BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
  246. dtype::Float32());
  247. }
  248. if (arg.C_stride == 0) {
  249. CL = TensorLayout(CS, dtype::Float32());
  250. } else {
  251. CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
  252. dtype::Float32());
  253. }
  254. checker.set_param(param).execl({AL, BL, CL});
  255. }
  256. }
  257. TEST_F(CUDA, MATRIX_MUL_CUBLASLT)
  258. {
  259. require_compute_capability(7, 5);
  260. NormalRNG normal_rng;
  261. Checker<MatrixMul> checker(handle_cuda());
  262. checker.set_rng(0, &normal_rng)
  263. .set_rng(1, &normal_rng)
  264. .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  265. using Param = MatrixMul::Param;
  266. size_t m = 32, n = 32, k = 32;
  267. // test Int8 matmul
  268. {
  269. DType dtype=dtype::Int32();
  270. Param param;
  271. param.transposeA = false;
  272. param.transposeB = false;
  273. DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
  274. TensorShape A, B;
  275. A = TensorShape{m, k};
  276. B = TensorShape{k, n};
  277. checker.set_param(param).
  278. set_dtype(0, stype).
  279. set_dtype(1, stype).
  280. set_dtype(2, dtype).
  281. set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3).
  282. execs({A, B, {}});
  283. }
  284. // test float-point matmul
  285. for (DType dtype: std::array<DType, 2>{
  286. {dtype::Float32(), dtype::Float16()}}) {
  287. for (unsigned mask = 0; mask < 4; ++mask) {
  288. Param param;
  289. param.transposeA = mask & 1;
  290. param.transposeB = mask & 2;
  291. DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
  292. TensorShape A, B;
  293. if (param.transposeA)
  294. A = TensorShape{k, m};
  295. else
  296. A = TensorShape{m, k};
  297. if (param.transposeB)
  298. B = TensorShape{n, k};
  299. else
  300. B = TensorShape{k, n};
  301. checker.set_param(param).
  302. set_dtype(0, stype).
  303. set_dtype(1, stype).
  304. set_dtype(2, dtype).
  305. set_epsilon(dtype == dtype::Float16() ? 5e-2 : 8e-3).
  306. execs({A, B, {}});
  307. }
  308. }
  309. // general tests
  310. auto args = matrix_mul::get_matmul_args();
  311. for (auto arg: args) {
  312. auto m = arg.m, n = arg.n, k = arg.k;
  313. auto mask = arg.mask;
  314. Param param;
  315. param.transposeA = mask & 1;
  316. param.transposeB = mask & 2;
  317. TensorShape AS, BS, CS;
  318. if (param.transposeA)
  319. AS = TensorShape{k, m};
  320. else
  321. AS = TensorShape{m, k};
  322. if (param.transposeB)
  323. BS = TensorShape{n, k};
  324. else
  325. BS = TensorShape{k, n};
  326. CS = TensorShape{m, n};
  327. TensorLayout AL, BL, CL;
  328. if (arg.A_stride == 0) {
  329. AL = TensorLayout(AS, dtype::Float32());
  330. } else {
  331. AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
  332. dtype::Float32());
  333. }
  334. if (arg.B_stride == 0) {
  335. BL = TensorLayout(BS, dtype::Float32());
  336. } else {
  337. BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
  338. dtype::Float32());
  339. }
  340. if (arg.C_stride == 0) {
  341. CL = TensorLayout(CS, dtype::Float32());
  342. } else {
  343. CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
  344. dtype::Float32());
  345. }
  346. checker.set_param(param).execl({AL, BL, CL});
  347. }
  348. }
  349. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_SPECIAL_CASE) {
  350. require_compute_capability(7, 5);
  351. size_t m = 12, n = 16, k = 20;
  352. Checker<MatrixMul> checker(handle_cuda());
  353. checker.set_before_exec_callback(
  354. AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  355. using Param = MatrixMul::Param;
  356. Param param;
  357. DType stype = dtype::Float32();
  358. DType dtype = dtype::Float32();
  359. TensorShape A, B;
  360. param.transposeA=param.transposeB=1;
  361. if (param.transposeA)
  362. A = TensorShape{k, m};
  363. else
  364. A = TensorShape{m, k};
  365. if (param.transposeB)
  366. B = TensorShape{n, k};
  367. else
  368. B = TensorShape{k, n};
  369. checker.set_param(param).
  370. set_dtype(0, stype).
  371. set_dtype(1, stype).
  372. set_dtype(2, dtype).
  373. set_epsilon(dtype == dtype::Float16() ? 5e-1 : 5e-2).
  374. execs({A, B, {}});
  375. }
  376. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
  377. require_compute_capability(7, 5);
  378. NormalRNG normal_rng;
  379. Checker<MatrixMul> checker(handle_cuda());
  380. checker.set_rng(0, &normal_rng)
  381. .set_rng(1, &normal_rng)
  382. .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  383. using Param = MatrixMul::Param;
  384. //size_t m = 32, n = 32, k = 32;
  385. // test Int8 matmul
  386. for (size_t m=8; m<=64; m+=4)
  387. for (size_t n=8; n<=64; n+=4)
  388. for (size_t k=8; k<=64; k+=4)
  389. {
  390. DType dtype=dtype::Int32();
  391. Param param;
  392. param.transposeA = false;
  393. param.transposeB = false;
  394. DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
  395. TensorShape A, B;
  396. A = TensorShape{m, k};
  397. B = TensorShape{k, n};
  398. checker.set_param(param).
  399. set_dtype(0, stype).
  400. set_dtype(1, stype).
  401. set_dtype(2, dtype).
  402. set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3).
  403. execs({A, B, {}});
  404. }
  405. }
  406. } // namespace test
  407. } // namespace megdnn
  408. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台