You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

matrix_mul.cpp 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. /**
  2. * \file dnn/test/cuda/matrix_mul.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/cuda/fixture.h"
  12. #include "test/common/checker.h"
  13. #include "test/common/matrix_mul.h"
  14. #include "test/common/benchmarker.h"
  15. #include "src/cuda/utils.h"
  16. #if defined(cuda_check)
  17. #undef cuda_check
  18. #endif
  19. #include "test/cuda/utils.h"
  20. #include <cuda.h>
  21. namespace megdnn {
  22. namespace test {
  23. #if CUDA_VERSION >= 10000
  24. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
  25. if (cuda::current_device_prop().major > 7 ||
  26. (cuda::current_device_prop().major == 7 &&
  27. cuda::current_device_prop().minor >= 5)) {
  28. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION test as current "
  29. "device support wmma intrinsics\n");
  30. return;
  31. }
  32. Checker<MatrixMul> checker(handle_cuda(), false);
  33. using Param = MatrixMul::Param;
  34. Param param;
  35. param.transposeB = true;
  36. checker.set_param(param);
  37. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  38. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  39. checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
  40. ASSERT_THROW(checker.exec({{256, 256}, {256, 256}, {256, 256}}),
  41. MegDNNError);
  42. }
  43. TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
  44. if (cuda::current_device_prop().major < 7 ||
  45. (cuda::current_device_prop().major == 7 &&
  46. cuda::current_device_prop().minor < 5)) {
  47. printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device doesn't support\n");
  48. return;
  49. }
  50. Checker<MatrixMul> checker(handle_cuda(), false);
  51. using Param = MatrixMul::Param;
  52. Param param;
  53. param.transposeB = true;
  54. checker.set_param(param);
  55. checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  56. checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
  57. checker.set_dtype(2, dtype::QuantizedS32(1.3f*1.3f));
  58. checker.exec({{256, 256}, {256, 256}, {256, 256}});
  59. auto args = matrix_mul::get_matmul_args();
  60. for (auto arg : args) {
  61. size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8,
  62. k = DIVUP(arg.k, 32) * 32;
  63. checker.exec({{m, k}, {n, k}, {m, n}});
  64. }
  65. }
  66. #if MEGDNN_WITH_BENCHMARK
  67. TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  68. if (cuda::current_device_prop().major < 7 ||
  69. (cuda::current_device_prop().major == 7 &&
  70. cuda::current_device_prop().minor < 5)) {
  71. printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
  72. "device doesn't support\n");
  73. return;
  74. }
  75. Benchmarker<MatrixMul> bencher(handle_cuda());
  76. using Param = MatrixMul::Param;
  77. Param param;
  78. param.transposeB = true;
  79. bencher.set_param(param);
  80. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  81. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  82. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  83. for (size_t m : {256, 1024, 4096, 10240, 40960}) {
  84. for (size_t n : {256, 1024, 4096}) {
  85. for (size_t k :{512, 1024, 2048}) {
  86. bencher.set_times(400);
  87. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  88. auto gflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  89. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n",
  90. m, k, n, time_in_ms, gflps);
  91. }
  92. }
  93. }
  94. }
  95. TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
  96. if (cuda::current_device_prop().major < 7 ||
  97. (cuda::current_device_prop().major == 7 &&
  98. cuda::current_device_prop().minor < 5)) {
  99. printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
  100. "current "
  101. "device doesn't support\n");
  102. return;
  103. }
  104. Benchmarker<MatrixMul> bencher(handle_cuda());
  105. using Param = MatrixMul::Param;
  106. Param param;
  107. param.transposeB = true;
  108. bencher.set_param(param);
  109. bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  110. bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
  111. bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
  112. bencher.set_times(400);
  113. size_t m = 4096, n = 4096, k = 81920;
  114. auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
  115. auto tflps = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
  116. printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n,
  117. time_in_ms, tflps);
  118. }
  119. #endif
  120. #endif
  121. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES) {
  122. if (!cuda::is_compute_capability_required(6, 1)) {
  123. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  124. return;
  125. }
  126. Checker<MatrixMul> checker(handle_cuda());
  127. using Param = MatrixMul::Param;
  128. Param param;
  129. DType stype = dtype::Int8();
  130. checker.set_param(param)
  131. .set_dtype(0, stype)
  132. .set_dtype(1, stype)
  133. .set_dtype(2, dtype::Int32())
  134. .set_epsilon(5e-3);
  135. size_t m = 1024, n = 1024, k = 1024;
  136. {
  137. TensorLayout A{{m, k}, {2048, 1}, dtype::Int8()},
  138. B{{k, n}, {2048, 1}, dtype::Int8()}, C{{m, n}, dtype::Int32()};
  139. checker.execl({A, B, {}});
  140. }
  141. }
  142. TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
  143. if (!cuda::is_compute_capability_required(6, 1)) {
  144. printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
  145. return;
  146. }
  147. using Param = MatrixMul::Param;
  148. UniformIntRNG rng{-128, 127};
  149. Checker<MatrixMul> checker(handle_cuda());
  150. checker.set_rng(0, &rng).set_rng(1, &rng);
  151. size_t m = 1007, n = 1003, k = 129;
  152. for (unsigned mask = 0; mask < 4; ++mask) {
  153. Param param;
  154. param.transposeA = mask & 1;
  155. param.transposeB = mask & 2;
  156. TensorShape A, B;
  157. if (param.transposeA)
  158. A = TensorShape{k, m};
  159. else
  160. A = TensorShape{m, k};
  161. if (param.transposeB)
  162. B = TensorShape{n, k};
  163. else
  164. B = TensorShape{k, n};
  165. checker.set_param(param)
  166. .set_dtype(0, dtype::Int8())
  167. .set_dtype(1, dtype::Int8())
  168. .set_dtype(2, dtype::Int32())
  169. .set_epsilon(0)
  170. .execs({A, B, {}});
  171. }
  172. }
// General matmul correctness test: float32/float16 (and int8->int32 on
// devices that support it) across all transpose combinations, then a
// sweep of shapes/strides from matrix_mul::get_matmul_args().
// Requires compute capability >= 6.0.
TEST_F(CUDA, MATRIX_MUL)
{
    if (cuda::current_device_prop().major < 6) {
        printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    size_t m = 12, n = 16, k = 20;
    // int8 matmul additionally needs compute capability 6.1 (dp4a)
    bool is_int_available = cuda::is_compute_capability_required(6, 1);
    std::vector<DType> dtype_array;
    dtype_array.push_back(dtype::Float32());
    dtype_array.push_back(dtype::Float16());
    if (is_int_available)
        dtype_array.push_back(dtype::Int32());
    for (DType dtype : dtype_array) {
        // mask bit 0 -> transposeA, bit 1 -> transposeB
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            // Int32 output implies Int8 inputs; otherwise in/out dtype match
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            // fp16 gets a looser tolerance than fp32/int
            checker.set_param(param).
                    set_dtype(0, stype).
                    set_dtype(1, stype).
                    set_dtype(2, dtype).
                    set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3).
                    execs({A, B, {}});
        }
    }
    // general tests: float32 with explicit (possibly padded) row strides
    auto args = matrix_mul::get_matmul_args();
    for (auto arg: args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        // a stride of 0 in the arg means "use the default contiguous layout"
        if (arg.A_stride == 0) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                              dtype::Float32());
        }
        if (arg.B_stride == 0) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                              dtype::Float32());
        }
        if (arg.C_stride == 0) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                              dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
// Same coverage as CUDA.MATRIX_MUL but forced through the CUBLAS_LT
// algorithm: one int8 case, float32/float16 over all transpose
// combinations, then the get_matmul_args() stride sweep.
// Requires compute capability >= 7.5.
TEST_F(CUDA, MATRIX_MUL_CUBLASLT)
{
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    size_t m = 32, n = 32, k = 32;
    // test Int8 matmul (no transposes; Int8 inputs, Int32 output)
    {
        DType dtype = dtype::Int32();
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param).
                set_dtype(0, stype).
                set_dtype(1, stype).
                set_dtype(2, dtype).
                set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3).
                execs({A, B, {}});
    }
    // test float-point matmul over all four transpose combinations
    for (DType dtype: std::array<DType, 2>{
            {dtype::Float32(), dtype::Float16()}}) {
        // mask bit 0 -> transposeA, bit 1 -> transposeB
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            // fp16 gets a looser tolerance than fp32
            checker.set_param(param).
                    set_dtype(0, stype).
                    set_dtype(1, stype).
                    set_dtype(2, dtype).
                    set_epsilon(dtype == dtype::Float16() ? 5e-2 : 8e-3).
                    execs({A, B, {}});
        }
    }
    // general tests: float32 with explicit (possibly padded) row strides
    auto args = matrix_mul::get_matmul_args();
    for (auto arg: args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        // a stride of 0 in the arg means "use the default contiguous layout"
        if (arg.A_stride == 0) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                              dtype::Float32());
        }
        if (arg.B_stride == 0) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                              dtype::Float32());
        }
        if (arg.C_stride == 0) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                              dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
  343. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_SPECIAL_CASE) {
  344. require_compute_capability(7, 5);
  345. size_t m = 12, n = 16, k = 20;
  346. Checker<MatrixMul> checker(handle_cuda());
  347. checker.set_before_exec_callback(
  348. AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  349. using Param = MatrixMul::Param;
  350. Param param;
  351. DType stype = dtype::Float32();
  352. DType dtype = dtype::Float32();
  353. TensorShape A, B;
  354. param.transposeA=param.transposeB=1;
  355. if (param.transposeA)
  356. A = TensorShape{k, m};
  357. else
  358. A = TensorShape{m, k};
  359. if (param.transposeB)
  360. B = TensorShape{n, k};
  361. else
  362. B = TensorShape{k, n};
  363. checker.set_param(param).
  364. set_dtype(0, stype).
  365. set_dtype(1, stype).
  366. set_dtype(2, dtype).
  367. set_epsilon(dtype == dtype::Float16() ? 5e-1 : 5e-2).
  368. execs({A, B, {}});
  369. }
  370. TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
  371. require_compute_capability(7, 5);
  372. NormalRNG normal_rng;
  373. Checker<MatrixMul> checker(handle_cuda());
  374. checker.set_rng(0, &normal_rng)
  375. .set_rng(1, &normal_rng)
  376. .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
  377. using Param = MatrixMul::Param;
  378. //size_t m = 32, n = 32, k = 32;
  379. // test Int8 matmul
  380. for (size_t m=8; m<=64; m+=4)
  381. for (size_t n=8; n<=64; n+=4)
  382. for (size_t k=8; k<=64; k+=4)
  383. {
  384. DType dtype=dtype::Int32();
  385. Param param;
  386. param.transposeA = false;
  387. param.transposeB = false;
  388. DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
  389. TensorShape A, B;
  390. A = TensorShape{m, k};
  391. B = TensorShape{k, n};
  392. checker.set_param(param).
  393. set_dtype(0, stype).
  394. set_dtype(1, stype).
  395. set_dtype(2, dtype).
  396. set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3).
  397. execs({A, B, {}});
  398. }
  399. }
  400. } // namespace test
  401. } // namespace megdnn
  402. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台