/**
 * \file dnn/test/cuda/relayout_format.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "test/cuda/benchmark.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/cuda/fixture.h"

using namespace megdnn;
using namespace test;
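
// Relayout between the NCHW4 and CHWN4 layouts in both directions on
// QuantizedS8 tensors; the empty output shape lets the checker deduce it.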
TEST_F(CUDA, RELAYOUT_FORMAT) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng{-50, 50};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW4_CHWN4;
    checker.set_dtype(0, dtype::QuantizedS8{0.1f})
            .set_dtype(1, dtype::QuantizedS8{0.1f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{22, 23, 24, 25, 4}, {}});

    param.mode = param::RelayoutFormat::Mode::CHWN4_NCHW4;
    checker.execs({{22, 23, 24, 25, 4}, {}});
}
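
// NCHW4 -> NCHW conversion, including a case where the original channel
// count is restored via param.oc and a grouped case via param.group.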
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4_NCHW) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng{-50, 50};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW4_NCHW;
    checker.set_dtype(0, dtype::QuantizedS8{0.1f})
            .set_dtype(1, dtype::QuantizedS8{0.1f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{1, 1, 2, 2, 4}, {}});
    checker.set_dtype(0, dtype::QuantizedS8{0.1f})
            .set_dtype(1, dtype::QuantizedS8{0.1f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{22, 23, 24, 25, 4}, {}});

    param.oc = 90;
    checker.set_dtype(0, dtype::QuantizedS8{0.1f})
            .set_dtype(1, dtype::QuantizedS8{0.1f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{22, 23, 24, 25, 4}, {}});

    param.oc = 16;
    param.group = 8;
    checker.set_dtype(0, dtype::QuantizedS8{0.1f})
            .set_dtype(1, dtype::QuantizedS8{0.1f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{11, 16, 22, 33, 4}, {}});
}
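
// NCHW -> NCHW4 conversion over a sweep of shapes for QuantizedS8 and
// QuantizedS32 inputs, including requantization (different input/output
// scales) and grouped cases.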
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng{-50, 50};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
    for (size_t n : {1, 3}) {
        for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
            for (size_t h : {3, 7, 12, 16, 22, 59, 83}) {
                for (size_t w : {3, 22, 63, 128, 256}) {
                    checker.set_dtype(0, dtype::QuantizedS8{1.f})
                            .set_dtype(1, dtype::QuantizedS8{1.f})
                            .set_rng(0, &rng)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                    checker.set_dtype(0, dtype::QuantizedS8{1.f})
                            .set_dtype(1, dtype::QuantizedS8{2.f})
                            .set_rng(0, &rng)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                    checker.set_dtype(0, dtype::QuantizedS32{1.f})
                            .set_dtype(1, dtype::QuantizedS32{1.f})
                            .set_rng(0, &rng)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                }
            }
        }
    }

    checker.set_dtype(0, dtype::QuantizedS8{1.f})
            .set_dtype(1, dtype::QuantizedS8{1.f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{8, 3, 224, 224}, {}});
    checker.set_dtype(0, dtype::QuantizedS8{1.f})
            .set_dtype(1, dtype::QuantizedS8{1.f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{8, 3, 600, 600}, {}});
    checker.set_dtype(0, dtype::QuantizedS8{1.f})
            .set_dtype(1, dtype::QuantizedS8{1.f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{1, 6, 768, 1280}, {}});

    param.group = 2;
    checker.set_dtype(0, dtype::QuantizedS8{1.f})
            .set_dtype(1, dtype::QuantizedS8{1.f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{8, 6, 300, 300}, {}});

    param.group = 3;
    checker.set_dtype(0, dtype::QuantizedS8{1.f})
            .set_dtype(1, dtype::QuantizedS8{1.f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{8, 6, 300, 300}, {}});
}
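
// Weight relayout for NCHW -> NCHW4, covering both 4-D (dense) and 5-D
// (grouped) filter shapes.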
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_WEIGHT) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng{-50, 50};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_WEIGHT;
    for (size_t oc : {1, 3, 4, 16, 33}) {
        for (size_t ic : {1, 2, 3, 4, 8, 9, 11, 16, 33}) {
            for (size_t h : {3, 5, 7}) {
                for (size_t w : {3, 5, 7}) {
                    checker.set_dtype(0, dtype::QuantizedS8{1.f})
                            .set_dtype(1, dtype::QuantizedS8{1.f})
                            .set_rng(0, &rng)
                            .set_param(param)
                            .execs({{oc, ic, h, w}, {}});
                }
            }
        }
    }

    checker.set_dtype(0, dtype::QuantizedS8{1.f})
            .set_dtype(1, dtype::QuantizedS8{1.f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{13, 13, 5, 5}, {}});
    checker.set_dtype(0, dtype::QuantizedS8{1.f})
            .set_dtype(1, dtype::QuantizedS8{1.f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{4, 16, 16, 3, 3}, {}});
    checker.set_dtype(0, dtype::QuantizedS8{1.f})
            .set_dtype(1, dtype::QuantizedS8{1.f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{4, 13, 11, 3, 3}, {}});
}
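
// NCHW -> NCHW4 with a dtype change from Quantized8Asymm (zero point 128)
// to QuantizedS8.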
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_DEFAULT) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng{0, 50};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
    for (size_t n : {1, 3}) {
        for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
            for (size_t h : {3, 7, 12, 16, 59, 83}) {
                for (size_t w : {3, 63, 128, 256}) {
                    checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
                            .set_dtype(1, dtype::QuantizedS8{1.f})
                            .set_rng(0, &rng)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                }
            }
        }
    }
}
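
// NCHW -> NCHW4 with plain Uint8 or Quantized8Asymm inputs converted to
// QuantizedS8, including a case with a different output scale.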
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_U8) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng{0, 255};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
    for (size_t n : {1, 3}) {
        for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
            for (size_t h : {3, 7, 12, 16, 59, 83}) {
                for (size_t w : {3, 13, 3 * 4, 63 * 4, 128 * 4, 256 * 4}) {
                    checker.set_dtype(0, dtype::Uint8())
                            .set_dtype(1, dtype::QuantizedS8{1.f})
                            .set_rng(0, &rng)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                    checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
                            .set_dtype(1, dtype::QuantizedS8{1.f})
                            .set_rng(0, &rng)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                    checker.set_dtype(0, dtype::Uint8())
                            .set_dtype(1, dtype::QuantizedS8{2.5f})
                            .set_rng(0, &rng)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                }
            }
        }
    }
}
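
// NCHW -> NCHW4_IC_SMALL, the variant for inputs with few channels (here a
// 3-channel image-like tensor).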
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_IC_SMALL) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng{0, 50};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
    checker.set_dtype(0, dtype::QuantizedS8{1.f})
            .set_dtype(1, dtype::QuantizedS8{1.f})
            .set_rng(0, &rng)
            .set_param(param)
            .execs({{8, 3, 768, 1280}, {}});
}
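
// NCHW -> NCHW64 for 4-bit dtypes (QuantizedS4 and Quantized4Asymm),
// covering scale and zero-point changes; a small epsilon absorbs rounding
// in the requantization case.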
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW64) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG s4{-8, 7};
    UniformIntRNG u4{0, 15};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
    for (size_t n : {1, 3}) {
        for (size_t c : {15, 64, 128}) {
            for (size_t h : {7, 14, 16, 28}) {
                for (size_t w : {2, 3, 7, 8, 16, 31}) {
                    checker.set_dtype(0, dtype::QuantizedS4{2.f})
                            .set_dtype(1, dtype::QuantizedS4{2.f})
                            .set_rng(0, &s4)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                    checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 8})
                            .set_dtype(1, dtype::Quantized4Asymm{1.2f, 4})
                            .set_rng(0, &u4)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                    checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
                            .set_dtype(1, dtype::QuantizedS4{1.f})
                            .set_rng(0, &s4)
                            .set_param(param)
                            .execs({{n, c, h, w}, {}});
                    checker.set_dtype(0, dtype::Quantized4Asymm{1.19990307f, 8})
                            .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
                            .set_rng(0, &u4)
                            .set_param(param)
                            .set_epsilon(1e-3)
                            .execs({{n, c, h, w}, {}});
                }
            }
        }
    }
}
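
// NCHW64 -> NCHW for 4-bit dtypes. When the channel count is not a multiple
// of 64, param.oc tells the operator how many channels of the padded input
// are real.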
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW64_NCHW) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG s4{-8, 7};
    UniformIntRNG u4{0, 15};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
    for (size_t n : {1, 3}) {
        for (size_t c : {15, 64, 128}) {
            for (size_t h : {7, 14, 16, 28}) {
                for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
                    if (c % 64 != 0) {
                        param.oc = c;
                    } else {
                        param.oc = 0;
                    }
                    checker.set_dtype(0, dtype::QuantizedS4{2.f})
                            .set_dtype(1, dtype::QuantizedS4{2.f})
                            .set_rng(0, &s4)
                            .set_param(param)
                            .set_epsilon(1e-3)
                            .execs({{n, (c + 63) / 64, h, w, 64}, {}});
                    checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 4})
                            .set_dtype(1, dtype::Quantized4Asymm{1.2f, 8})
                            .set_rng(0, &u4)
                            .set_param(param)
                            .set_epsilon(1e-3)
                            .execs({{n, (c + 63) / 64, h, w, 64}, {}});
                    checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
                            .set_dtype(1, dtype::QuantizedS4{1.f})
                            .set_rng(0, &s4)
                            .set_param(param)
                            .set_epsilon(1e-3)
                            .execs({{n, (c + 63) / 64, h, w, 64}, {}});
                    checker.set_dtype(0, dtype::Quantized4Asymm{1.20211209f, 8})
                            .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
                            .set_rng(0, &u4)
                            .set_param(param)
                            .set_epsilon(1e-3)
                            .execs({{n, (c + 63) / 64, h, w, 64}, {}});
                }
            }
        }
    }
}
#if MEGDNN_WITH_BENCHMARK
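
// Benchmark NCHW -> NCHW4 and, for inputs with at most 4 channels, compare
// against the NCHW_NCHW4_IC_SMALL variant. memaccess estimates read plus
// write traffic in MB (int8 elements, output channels padded to a multiple
// of 4), so memaccess / time_ms yields GB/s.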
TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
    using Param = RelayoutFormat::Param;
    auto run = [&](const TensorShapeArray& shapes, Param param,
                   Param default_param) {
        Benchmarker<RelayoutFormat> benchmarker(handle_cuda());
        benchmarker.set_param(param);
        benchmarker.set_dtype(0, dtype::QuantizedS8{1.f})
                .set_dtype(1, dtype::QuantizedS8{1.f});
        Benchmarker<RelayoutFormat> benchmarker_default(handle_cuda());
        benchmarker_default.set_param(default_param);
        benchmarker_default.set_dtype(0, dtype::QuantizedS8{1.f})
                .set_dtype(1, dtype::QuantizedS8{1.f});
        for (auto&& shape : shapes) {
            double memaccess = (double(shape.total_nr_elems()) +
                                double(shape[0]) * ((shape[1] + 3) / 4 * 4) *
                                        shape[2] * shape[3]) *
                               1e-6;
            auto time_ms = benchmarker.execs({shape, {}});
            if (shape[1] <= 4) {
                auto time_default_ms = benchmarker_default.execs({shape, {}});
                printf("execute %s, time %.4f ms, %.4f GB/s, default %.4f "
                       "GB/s\n",
                       shape.to_string().c_str(), time_ms, memaccess / time_ms,
                       memaccess / time_default_ms);
            } else {
                printf("execute %s, time %.4f ms, %.4f GB/s\n",
                       shape.to_string().c_str(), time_ms, memaccess / time_ms);
            }
        }
    };

    TensorShapeArray shapes = {
            {8, 1, 768, 1280}, {8, 3, 768, 1280}, {8, 3, 224, 224},
            {8, 4, 768, 1280}, {64, 3, 768, 1280},
    };
    {
        Param param;
        param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
        Param default_param;
        default_param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
        run(shapes, param, default_param);
    }
}
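
// Benchmark 4-bit relayouts (NCHW -> NCHW64 and NCHW64 -> NCHW). memaccess
// counts the packed int4 byte span of the tensor twice (read + write) in MB,
// so memaccess / time_ms again yields GB/s.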
TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT_QS4) {
    using Param = RelayoutFormat::Param;
    auto run = [&](const TensorShapeArray& shapes, Param param) {
        CUBenchmarker<RelayoutFormat> benchmarker(handle_cuda());
        benchmarker.set_param(param);
        benchmarker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
                .set_dtype(1, dtype::QuantizedS4{1.19990307f});
        for (auto&& shape : shapes) {
            double memaccess =
                    double(TensorLayout(shape, dtype::QuantizedS4{1.f})
                                   .span()
                                   .dist_byte()) *
                    2e-6;
            auto time_ms = benchmarker.execs({shape, {}});
            printf("execute %s, time %.4f ms, %.4f GB/s\n",
                   shape.to_string().c_str(), time_ms, memaccess / time_ms);
        }
    };

    {
        TensorShapeArray shapes = {
                {1, 64, 56, 56},  {16, 64, 56, 56}, {64, 64, 56, 56},
                {1, 64, 56, 55},  {16, 64, 56, 55}, {64, 64, 56, 55},
                {1, 256, 384, 640},
        };
        Param param;
        param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
        run(shapes, param);
    }
    {
        TensorShapeArray shapes = {
                {64, 1, 56, 56, 64},
                {1, 32, 7, 7, 64},
                {16, 32, 7, 7, 64},
                {64, 32, 7, 7, 64},
                {1, 4, 384, 640, 64},
        };
        Param param;
        param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
        run(shapes, param);
    }
}
#endif
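
// NCHW -> NCHW4_IC_SMALL for both activations and dense convolution weights,
// checked with QuantizedS8 and Float32 dtypes.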
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
    Checker<RelayoutFormat> checker(handle_cuda());
    UniformIntRNG rng{-50, 50};
    param::RelayoutFormat param;
    param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
    for (DType dtype :
         std::vector<DType>({dtype::QuantizedS8{0.1f}, dtype::Float32{}})) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype).set_rng(0, &rng);

        checker.set_param(param).execs({{2, 4, 35, 36}, {}});
        checker.set_param(param).execs({{2, 3, 35, 36}, {}});
        checker.set_param(param).execs({{2, 1, 35, 36}, {}});

        param.mode = param::RelayoutFormat::Mode::
                NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT;
        checker.set_param(param).execs({{4, 3, 3, 3}, {}});
        checker.set_param(param).execs({{4, 4, 3, 3}, {}});
        checker.set_param(param).execs({{1, 4, 3, 3}, {}});
    }
}

// vim: syntax=cpp.doxygen
