/**
 * \file dnn/test/cuda/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/cuda/fixture.h"

#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/benchmarker.h"

#include "src/cuda/utils.h"
#if defined(cuda_check)
#undef cuda_check
#endif
#include "test/cuda/utils.h"

#include <cuda.h>

namespace megdnn {
namespace test {

#if CUDA_VERSION >= 10000
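// NOTE: the 4-bit wmma tests below are only built with CUDA >= 10.0, which
// introduced the sub-byte wmma intrinsics they rely on.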
TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
    if (cuda::current_device_prop().major > 7 ||
        (cuda::current_device_prop().major == 7 &&
         cuda::current_device_prop().minor >= 5)) {
        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION test as current "
               "device supports wmma intrinsics\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda(), false);
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    checker.set_param(param);
    checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
    ASSERT_THROW(checker.exec({{256, 256}, {256, 256}, {256, 256}}),
                 MegDNNError);
}

TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
    if (cuda::current_device_prop().major < 7 ||
        (cuda::current_device_prop().major == 7 &&
         cuda::current_device_prop().minor < 5)) {
        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
               "doesn't support\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda(), false);
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    checker.set_param(param);
    checker.set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3));
    checker.set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));
    checker.exec({{256, 256}, {256, 256}, {256, 256}});
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
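        // round m and n up to multiples of 8 and k up to a multiple of 32 to
        // match the 8x8x32 fragment shape used by the int4 wmma kernel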
        size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8,
               k = DIVUP(arg.k, 32) * 32;
        checker.exec({{m, k}, {n, k}, {m, n}});
    }
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
    if (cuda::current_device_prop().major < 7 ||
        (cuda::current_device_prop().major == 7 &&
         cuda::current_device_prop().minor < 5)) {
        printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
               "device doesn't support\n");
        return;
    }
    Benchmarker<MatrixMul> bencher(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    bencher.set_param(param);
    bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
    for (size_t m : {256, 1024, 4096, 10240, 40960}) {
        for (size_t n : {256, 1024, 4096}) {
            for (size_t k : {512, 1024, 2048}) {
                bencher.set_times(400);
                auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
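                // a matmul performs 2 * m * n * k multiply-adds; dividing by
                // the runtime in seconds (time_in_ms * 1e-3) and scaling by
                // 1e-12 yields TFLOPS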
                auto tflops = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
                printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n",
                       m, k, n, time_in_ms, tflops);
            }
        }
    }
}

TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
    if (cuda::current_device_prop().major < 7 ||
        (cuda::current_device_prop().major == 7 &&
         cuda::current_device_prop().minor < 5)) {
        printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
               "current device doesn't support\n");
        return;
    }
    Benchmarker<MatrixMul> bencher(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    bencher.set_param(param);
    bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
    bencher.set_times(400);
    size_t m = 4096, n = 4096, k = 81920;
    auto time_in_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
    auto tflops = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
    printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n,
           time_in_ms, tflops);
}
#endif
#endif

TEST_F(CUDA, MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES) {
    if (cuda::current_device_prop().major < 6) {
        printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Int8();
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype::Int32())
            .set_epsilon(5e-3);
    size_t m = 1024, n = 1024, k = 1024;
    {
        TensorLayout A{{m, k}, {2048, 1}, dtype::Int8()},
                B{{k, n}, {2048, 1}, dtype::Int8()}, C{{m, n}, dtype::Int32()};
        checker.execl({A, B, {}});
    }
}

TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
    if (cuda::current_device_prop().major < 6) {
        printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
        return;
    }
    using Param = MatrixMul::Param;
    UniformIntRNG rng{-128, 127};
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &rng).set_rng(1, &rng);

    size_t m = 1007, n = 1003, k = 129;
    for (unsigned mask = 0; mask < 4; ++mask) {
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape A, B;
        if (param.transposeA)
            A = TensorShape{k, m};
        else
            A = TensorShape{m, k};
        if (param.transposeB)
            B = TensorShape{n, k};
        else
            B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_epsilon(0)
                .execs({A, B, {}});
    }
}

TEST_F(CUDA, MATRIX_MUL) {
    if (cuda::current_device_prop().major < 6) {
        printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    size_t m = 12, n = 16, k = 20;
    for (DType dtype : std::array<DType, 3>{
                 {dtype::Float32(), dtype::Float16(), dtype::Int32()}}) {
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                    .execs({A, B, {}});
        }
    }

    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        if (arg.A_stride == 0) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                              dtype::Float32());
        }
        if (arg.B_stride == 0) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                              dtype::Float32());
        }
        if (arg.C_stride == 0) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                              dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}

TEST_F(CUDA, MATRIX_MUL_CUBLASLT) {
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    size_t m = 32, n = 32, k = 32;
    // test Int8 matmul
    {
        DType dtype = dtype::Int32();
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
        TensorShape A, B;
        A = TensorShape{m, k};
        B = TensorShape{k, n};
        checker.set_param(param)
                .set_dtype(0, stype)
                .set_dtype(1, stype)
                .set_dtype(2, dtype)
                .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                .execs({A, B, {}});
    }
    // test floating-point matmul
    for (DType dtype : std::array<DType, 2>{
                 {dtype::Float32(), dtype::Float16()}}) {
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
            TensorShape A, B;
            if (param.transposeA)
                A = TensorShape{k, m};
            else
                A = TensorShape{m, k};
            if (param.transposeB)
                B = TensorShape{n, k};
            else
                B = TensorShape{k, n};
            checker.set_param(param)
                    .set_dtype(0, stype)
                    .set_dtype(1, stype)
                    .set_dtype(2, dtype)
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 8e-3)
                    .execs({A, B, {}});
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        auto m = arg.m, n = arg.n, k = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{k, m};
        else
            AS = TensorShape{m, k};
        if (param.transposeB)
            BS = TensorShape{n, k};
        else
            BS = TensorShape{k, n};
        CS = TensorShape{m, n};
        TensorLayout AL, BL, CL;
        if (arg.A_stride == 0) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                              dtype::Float32());
        }
        if (arg.B_stride == 0) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                              dtype::Float32());
        }
        if (arg.C_stride == 0) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                              dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}

TEST_F(CUDA, MATRIX_MUL_CUBLASLT_SPECIAL_CASE) {
    require_compute_capability(7, 5);
    size_t m = 12, n = 16, k = 20;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Float32();
    DType dtype = dtype::Float32();
    TensorShape A, B;
    param.transposeA = param.transposeB = 1;
    if (param.transposeA)
        A = TensorShape{k, m};
    else
        A = TensorShape{m, k};
    if (param.transposeB)
        B = TensorShape{n, k};
    else
        B = TensorShape{k, n};
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype)
            .set_epsilon(dtype == dtype::Float16() ? 5e-1 : 5e-2)
            .execs({A, B, {}});
}

TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    // size_t m = 32, n = 32, k = 32;
    // test Int8 matmul
    for (size_t m = 8; m <= 64; m += 4)
        for (size_t n = 8; n <= 64; n += 4)
            for (size_t k = 8; k <= 64; k += 4) {
                DType dtype = dtype::Int32();
                Param param;
                param.transposeA = false;
                param.transposeB = false;
                DType stype = dtype == dtype::Int32() ? dtype::Int8() : dtype;
                TensorShape A, B;
                A = TensorShape{m, k};
                B = TensorShape{k, n};
                checker.set_param(param)
                        .set_dtype(0, stype)
                        .set_dtype(1, stype)
                        .set_dtype(2, dtype)
                        .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
                        .execs({A, B, {}});
            }
}

} // namespace test
} // namespace megdnn

// vim: syntax=cpp.doxygen

The MegEngine installation package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine actually has a GPU and that the driver is properly installed. If you would like to try deep learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.
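As a quick sanity check before running the GPU tests above, the following is a minimal sketch (not part of the MegEngine sources) that uses only the CUDA runtime API to confirm a CUDA-capable device and a working driver are present; the reported compute capability also tells you which of the tests above can actually run (for example, the int4 wmma tests need sm_75 or newer):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess || count == 0) {
        // either the driver is missing or no CUDA device was found
        std::printf("no usable CUDA device: %s\n", cudaGetErrorString(err));
        return 1;
    }
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    // compute capability (prop.major, prop.minor) decides which tests run
    std::printf("found %d device(s); device 0: %s (sm_%d%d)\n", count,
                prop.name, prop.major, prop.minor);
    return 0;
}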