You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

relayout_format.cpp 22 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  1. /**
  2. * \file dnn/test/cuda/relayout_format.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megdnn/dtype.h"
  13. #include "megdnn/oprs.h"
  14. #include "test/cuda/benchmark.h"
  15. #include "test/common/checker.h"
  16. #include "test/common/rng.h"
  17. #include "test/cuda/fixture.h"
  18. using namespace megdnn;
  19. using namespace test;
  20. TEST_F(CUDA, RELAYOUT_FORMAT) {
  21. Checker<RelayoutFormat> checker(handle_cuda());
  22. UniformIntRNG rng{-50, 50};
  23. param::RelayoutFormat param;
  24. param.mode = param::RelayoutFormat::Mode::NCHW4_CHWN4;
  25. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  26. .set_dtype(1, dtype::QuantizedS8{0.1f})
  27. .set_rng(0, &rng)
  28. .set_param(param)
  29. .execs({{22, 23, 24, 25, 4}, {}});
  30. param.mode = param::RelayoutFormat::Mode::CHWN4_NCHW4;
  31. checker.execs({{22, 23, 24, 25, 4}, {}});
  32. }
  33. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4_NCHW) {
  34. Checker<RelayoutFormat> checker(handle_cuda());
  35. UniformIntRNG rng{-50, 50};
  36. UniformIntRNG u8_rng{0, 255};
  37. param::RelayoutFormat param;
  38. param.mode = param::RelayoutFormat::Mode::NCHW4_NCHW;
  39. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  40. .set_dtype(1, dtype::QuantizedS8{0.1f})
  41. .set_rng(0, &rng)
  42. .set_param(param)
  43. .execs({{1, 1, 2, 2, 4}, {}});
  44. checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
  45. .set_dtype(1, dtype::Quantized8Asymm{1.f, 128})
  46. .set_rng(0, &u8_rng)
  47. .set_param(param)
  48. .execs({{1, 1, 2, 2, 4}, {}});
  49. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  50. .set_dtype(1, dtype::QuantizedS8{0.1f})
  51. .set_rng(0, &rng)
  52. .set_param(param)
  53. .execs({{22, 23, 24, 25, 4}, {}});
  54. param.oc = 90;
  55. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  56. .set_dtype(1, dtype::QuantizedS8{0.1f})
  57. .set_rng(0, &rng)
  58. .set_param(param)
  59. .execs({{22, 23, 24, 25, 4}, {}});
  60. param.oc = 16;
  61. param.group = 8;
  62. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  63. .set_dtype(1, dtype::QuantizedS8{0.1f})
  64. .set_rng(0, &rng)
  65. .set_param(param)
  66. .execs({{11, 16, 22, 33, 4}, {}});
  67. }
  68. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4) {
  69. Checker<RelayoutFormat> checker(handle_cuda());
  70. UniformIntRNG rng{-50, 50};
  71. param::RelayoutFormat param;
  72. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  73. for (size_t n : {1, 3}) {
  74. for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
  75. for (size_t h : {3, 7, 12, 16, 22, 59, 83}) {
  76. for (size_t w : {3, 22, 63, 128, 256}) {
  77. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  78. .set_dtype(1, dtype::QuantizedS8{1.f})
  79. .set_rng(0, &rng)
  80. .set_param(param)
  81. .execs({{n, c, h, w}, {}});
  82. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  83. .set_dtype(1, dtype::QuantizedS8{2.f})
  84. .set_rng(0, &rng)
  85. .set_param(param)
  86. .execs({{n, c, h, w}, {}});
  87. checker.set_dtype(0, dtype::QuantizedS32{1.f})
  88. .set_dtype(1, dtype::QuantizedS32{1.f})
  89. .set_rng(0, &rng)
  90. .set_param(param)
  91. .execs({{n, c, h, w}, {}});
  92. }
  93. }
  94. }
  95. }
  96. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  97. .set_dtype(1, dtype::QuantizedS8{1.f})
  98. .set_rng(0, &rng)
  99. .set_param(param)
  100. .execs({{8, 3, 224, 224}, {}});
  101. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  102. .set_dtype(1, dtype::QuantizedS8{1.f})
  103. .set_rng(0, &rng)
  104. .set_param(param)
  105. .execs({{8, 3, 600, 600}, {}});
  106. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  107. .set_dtype(1, dtype::QuantizedS8{1.f})
  108. .set_rng(0, &rng)
  109. .set_param(param)
  110. .execs({{1, 6, 768, 1280}, {}});
  111. param.group = 2;
  112. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  113. .set_dtype(1, dtype::QuantizedS8{1.f})
  114. .set_rng(0, &rng)
  115. .set_param(param)
  116. .execs({{8, 6, 300, 300}, {}});
  117. param.group = 3;
  118. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  119. .set_dtype(1, dtype::QuantizedS8{1.f})
  120. .set_rng(0, &rng)
  121. .set_param(param)
  122. .execs({{8, 6, 300, 300}, {}});
  123. }
  124. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_WEIGHT) {
  125. Checker<RelayoutFormat> checker(handle_cuda());
  126. UniformIntRNG rng{-50, 50};
  127. param::RelayoutFormat param;
  128. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_WEIGHT;
  129. for (size_t oc : {1, 3, 4, 16, 33}) {
  130. for (size_t ic : {1, 2, 3, 4, 8, 9, 11, 16, 33}) {
  131. for (size_t h : {3, 5, 7}) {
  132. for (size_t w : {3, 5, 7}) {
  133. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  134. .set_dtype(1, dtype::QuantizedS8{1.f})
  135. .set_rng(0, &rng)
  136. .set_param(param)
  137. .execs({{oc, ic, h, w}, {}});
  138. }
  139. }
  140. }
  141. }
  142. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  143. .set_dtype(1, dtype::QuantizedS8{1.f})
  144. .set_rng(0, &rng)
  145. .set_param(param)
  146. .execs({{13, 13, 5, 5}, {}});
  147. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  148. .set_dtype(1, dtype::QuantizedS8{1.f})
  149. .set_rng(0, &rng)
  150. .set_param(param)
  151. .execs({{4, 16, 16, 3, 3}, {}});
  152. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  153. .set_dtype(1, dtype::QuantizedS8{1.f})
  154. .set_rng(0, &rng)
  155. .set_param(param)
  156. .execs({{4, 13, 11, 3, 3}, {}});
  157. }
  158. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_DEFAULT) {
  159. Checker<RelayoutFormat> checker(handle_cuda());
  160. UniformIntRNG rng{0, 50};
  161. param::RelayoutFormat param;
  162. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  163. for (size_t n : {1, 3}) {
  164. for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
  165. for (size_t h : {3, 7, 12, 16, 59, 83}) {
  166. for (size_t w : {3, 63, 128, 256}) {
  167. checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
  168. .set_dtype(1, dtype::QuantizedS8{1.f})
  169. .set_rng(0, &rng)
  170. .set_param(param)
  171. .execs({{n, c, h, w}, {}});
  172. }
  173. }
  174. }
  175. }
  176. }
  177. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_U8) {
  178. Checker<RelayoutFormat> checker(handle_cuda());
  179. UniformIntRNG rng{0, 255};
  180. param::RelayoutFormat param;
  181. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  182. for (size_t n : {1, 3}) {
  183. for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
  184. for (size_t h : {3, 7, 12, 16, 59, 83}) {
  185. for (size_t w : {3, 13, 3 * 4, 63 * 4, 128 * 4, 256 * 4}) {
  186. checker.set_dtype(0, dtype::Uint8())
  187. .set_dtype(1, dtype::QuantizedS8{1.f})
  188. .set_rng(0, &rng)
  189. .set_param(param)
  190. .execs({{n, c, h, w}, {}});
  191. checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
  192. .set_dtype(1, dtype::QuantizedS8{1.f})
  193. .set_rng(0, &rng)
  194. .set_param(param)
  195. .execs({{n, c, h, w}, {}});
  196. checker.set_dtype(0, dtype::Uint8())
  197. .set_dtype(1, dtype::QuantizedS8{2.5f})
  198. .set_rng(0, &rng)
  199. .set_param(param)
  200. .execs({{n, c, h, w}, {}});
  201. }
  202. }
  203. }
  204. }
  205. }
  206. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_IC_SMALL) {
  207. Checker<RelayoutFormat> checker(handle_cuda());
  208. UniformIntRNG rng{0, 50};
  209. param::RelayoutFormat param;
  210. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
  211. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  212. .set_dtype(1, dtype::QuantizedS8{1.f})
  213. .set_rng(0, &rng)
  214. .set_param(param)
  215. .execs({{8, 3, 768, 1280}, {}});
  216. }
  217. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW64) {
  218. Checker<RelayoutFormat> checker(handle_cuda());
  219. UniformIntRNG s4{-8, 7};
  220. UniformIntRNG u4{0, 15};
  221. param::RelayoutFormat param;
  222. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
  223. for (size_t n : {1, 3}) {
  224. for (size_t c : {15, 64, 128}) {
  225. for (size_t h : {7, 14, 16, 28}) {
  226. for (size_t w : {2, 3, 7, 8, 16, 31}) {
  227. checker.set_dtype(0, dtype::QuantizedS4{2.f})
  228. .set_dtype(1, dtype::QuantizedS4{2.f})
  229. .set_rng(0, &s4)
  230. .set_param(param)
  231. .execs({{n, c, h, w}, {}});
  232. checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 8})
  233. .set_dtype(1, dtype::Quantized4Asymm{1.2f, 4})
  234. .set_rng(0, &u4)
  235. .set_param(param)
  236. .execs({{n, c, h, w}, {}});
  237. checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  238. .set_dtype(1, dtype::QuantizedS4{1.f})
  239. .set_rng(0, &s4)
  240. .set_param(param)
  241. .execs({{n, c, h, w}, {}});
  242. checker.set_dtype(0, dtype::Quantized4Asymm{1.19990307f, 8})
  243. .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
  244. .set_rng(0, &u4)
  245. .set_param(param)
  246. .set_epsilon(1e-3)
  247. .execs({{n, c, h, w}, {}});
  248. }
  249. }
  250. }
  251. }
  252. }
  253. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW64_NCHW) {
  254. Checker<RelayoutFormat> checker(handle_cuda());
  255. UniformIntRNG s4{-8, 7};
  256. UniformIntRNG u4{0, 15};
  257. param::RelayoutFormat param;
  258. param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
  259. for (size_t n : {1, 3}) {
  260. for (size_t c : {15, 64, 128}) {
  261. for (size_t h : {7, 14, 16, 28}) {
  262. for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
  263. if (c % 64 != 0) {
  264. param.oc = c;
  265. } else {
  266. param.oc = 0;
  267. }
  268. checker.set_dtype(0, dtype::QuantizedS4{2.f})
  269. .set_dtype(1, dtype::QuantizedS4{2.f})
  270. .set_rng(0, &s4)
  271. .set_param(param)
  272. .set_epsilon(1e-3)
  273. .execs({{n, (c + 63) / 64, h, w, 64}, {}});
  274. checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 4})
  275. .set_dtype(1, dtype::Quantized4Asymm{1.2f, 8})
  276. .set_rng(0, &u4)
  277. .set_param(param)
  278. .set_epsilon(1e-3)
  279. .execs({{n, (c + 63) / 64, h, w, 64}, {}});
  280. checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  281. .set_dtype(1, dtype::QuantizedS4{1.f})
  282. .set_rng(0, &s4)
  283. .set_param(param)
  284. .set_epsilon(1e-3)
  285. .execs({{n, (c + 63) / 64, h, w, 64}, {}});
  286. checker.set_dtype(0, dtype::Quantized4Asymm{1.20211209f, 8})
  287. .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
  288. .set_rng(0, &u4)
  289. .set_param(param)
  290. .set_epsilon(1e-3)
  291. .execs({{n, (c + 63) / 64, h, w, 64}, {}});
  292. }
  293. }
  294. }
  295. }
  296. }
  297. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NHWC) {
  298. Checker<RelayoutFormat> checker(handle_cuda());
  299. UniformIntRNG s4{-8, 7};
  300. UniformIntRNG u4{0, 15};
  301. param::RelayoutFormat param;
  302. param.mode = param::RelayoutFormat::Mode::NCHW_NHWC;
  303. for (size_t n : {1, 3}) {
  304. for (size_t c : {8, 16}) {
  305. for (size_t h : {7, 14, 16, 28}) {
  306. for (size_t w : {2, 3, 7, 8, 16, 31}) {
  307. checker.set_dtype(0, dtype::QuantizedS4{2.f})
  308. .set_dtype(1, dtype::QuantizedS4{2.f})
  309. .set_rng(0, &s4)
  310. .set_param(param)
  311. .execs({{n, c, h, w}, {}});
  312. checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 8})
  313. .set_dtype(1, dtype::Quantized4Asymm{1.2f, 4})
  314. .set_rng(0, &u4)
  315. .set_param(param)
  316. .execs({{n, c, h, w}, {}});
  317. checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  318. .set_dtype(1, dtype::QuantizedS4{1.f})
  319. .set_rng(0, &s4)
  320. .set_param(param)
  321. .execs({{n, c, h, w}, {}});
  322. checker.set_dtype(0, dtype::Quantized4Asymm{1.19990307f, 8})
  323. .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
  324. .set_rng(0, &u4)
  325. .set_param(param)
  326. .set_epsilon(1e-3)
  327. .execs({{n, c, h, w}, {}});
  328. }
  329. }
  330. }
  331. }
  332. checker.execs({{1, 256, 384, 640}, {}});
  333. }
  334. TEST_F(CUDA, RELAYOUT_FORMAT_NHWC_NCHW) {
  335. Checker<RelayoutFormat> checker(handle_cuda());
  336. UniformIntRNG s4{-8, 7};
  337. UniformIntRNG u4{0, 15};
  338. param::RelayoutFormat param;
  339. param.mode = param::RelayoutFormat::Mode::NHWC_NCHW;
  340. for (size_t n : {1, 3}) {
  341. for (size_t c : {8, 16}) {
  342. for (size_t h : {7, 14, 16, 28}) {
  343. for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
  344. checker.set_dtype(0, dtype::QuantizedS4{2.f})
  345. .set_dtype(1, dtype::QuantizedS4{2.f})
  346. .set_rng(0, &s4)
  347. .set_param(param)
  348. .set_epsilon(1e-3)
  349. .execs({{n, h, w, c}, {}});
  350. checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 4})
  351. .set_dtype(1, dtype::Quantized4Asymm{1.2f, 8})
  352. .set_rng(0, &u4)
  353. .set_param(param)
  354. .set_epsilon(1e-3)
  355. .execs({{n, h, w, c}, {}});
  356. checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  357. .set_dtype(1, dtype::QuantizedS4{1.f})
  358. .set_rng(0, &s4)
  359. .set_param(param)
  360. .set_epsilon(1e-3)
  361. .execs({{n, h, w, c}, {}});
  362. checker.set_dtype(0, dtype::Quantized4Asymm{1.20211209f, 8})
  363. .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
  364. .set_rng(0, &u4)
  365. .set_param(param)
  366. .set_epsilon(1e-3)
  367. .execs({{n, h, w, c}, {}});
  368. }
  369. }
  370. }
  371. }
  372. checker.execs({{1, 384, 640, 256}, {}});
  373. }
  374. #if MEGDNN_WITH_BENCHMARK
  375. TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
  376. using Param = RelayoutFormat::Param;
  377. auto run = [&](const TensorShapeArray& shapes, Param param,
  378. Param default_param) {
  379. Benchmarker<RelayoutFormat> benchmarker(handle_cuda());
  380. benchmarker.set_param(param);
  381. benchmarker.set_dtype(0, dtype::QuantizedS8{1.f})
  382. .set_dtype(1, dtype::QuantizedS8{1.f});
  383. Benchmarker<RelayoutFormat> benchmarker_default(handle_cuda());
  384. benchmarker_default.set_param(default_param);
  385. benchmarker_default.set_dtype(0, dtype::QuantizedS8{1.f})
  386. .set_dtype(1, dtype::QuantizedS8{1.f});
  387. for (auto&& shape : shapes) {
  388. double memaccess = (double(shape.total_nr_elems()) +
  389. double(shape[0]) * ((shape[1] + 3) / 4 * 4) *
  390. shape[2] * shape[3]) *
  391. 1e-6;
  392. auto time_ms = benchmarker.execs({shape, {}});
  393. if (shape[1] <= 4) {
  394. auto time_default_ms = benchmarker_default.execs({shape, {}});
  395. printf("execute %s, time %.4f ms, %.4f GB/s, default %.4f "
  396. "GB/s\n",
  397. shape.to_string().c_str(), time_ms, memaccess / time_ms,
  398. memaccess / time_default_ms);
  399. } else {
  400. printf("execute %s, time %.4f ms, %.4f GB/s\n",
  401. shape.to_string().c_str(), time_ms, memaccess / time_ms);
  402. }
  403. }
  404. };
  405. TensorShapeArray shapes = {
  406. {8, 1, 768, 1280}, {8, 3, 768, 1280}, {8, 3, 224, 224},
  407. {8, 4, 768, 1280}, {64, 3, 768, 1280},
  408. };
  409. {
  410. Param param;
  411. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  412. Param default_param;
  413. default_param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
  414. run(shapes, param, default_param);
  415. }
  416. }
  417. TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT_QS4) {
  418. using Param = RelayoutFormat::Param;
  419. auto run = [&](const TensorShapeArray& shapes, Param param) {
  420. CUBenchmarker<RelayoutFormat> benchmarker(handle_cuda());
  421. benchmarker.set_param(param);
  422. benchmarker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  423. .set_dtype(1, dtype::QuantizedS4{1.19990307f});
  424. for (auto&& shape : shapes) {
  425. double memaccess =
  426. double(TensorLayout(shape, dtype::QuantizedS4{1.f})
  427. .span()
  428. .dist_byte()) *
  429. 2e-6;
  430. auto time_ms = benchmarker.execs({shape, {}});
  431. printf("execute %s, time %.4f ms, %.4f GB/s\n",
  432. shape.to_string().c_str(), time_ms, memaccess / time_ms);
  433. }
  434. };
  435. printf("nchw -> nchw64\n");
  436. {
  437. TensorShapeArray shapes = {
  438. {1, 64, 56, 56}, {16, 64, 56, 56}, {64, 64, 56, 56},
  439. {1, 64, 56, 55}, {16, 64, 56, 55}, {64, 64, 56, 55},
  440. {1, 256, 384, 640},
  441. };
  442. Param param;
  443. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
  444. run(shapes, param);
  445. }
  446. printf("nchw -> nhwc\n");
  447. {
  448. TensorShapeArray shapes = {
  449. {1, 64, 56, 56}, {16, 64, 56, 56}, {64, 64, 56, 56},
  450. {1, 64, 56, 55}, {16, 64, 56, 55}, {64, 64, 56, 55},
  451. {1, 256, 384, 640}, {16, 16, 384, 640},
  452. };
  453. Param param;
  454. param.mode = param::RelayoutFormat::Mode::NCHW_NHWC;
  455. run(shapes, param);
  456. }
  457. printf("nchw64 -> nchw\n");
  458. {
  459. TensorShapeArray shapes = {
  460. {64, 1, 56, 56, 64},
  461. {1, 32, 7, 7, 64},
  462. {16, 32, 7, 7, 64},
  463. {64, 32, 7, 7, 64},
  464. {1, 4, 384, 640, 64},
  465. };
  466. Param param;
  467. param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
  468. run(shapes, param);
  469. }
  470. printf("nhwc -> nchw\n");
  471. {
  472. TensorShapeArray shapes = {
  473. {64, 56, 56, 64},
  474. {1, 7, 7, 64*32},
  475. {16, 7, 7, 64*32},
  476. {64, 7, 7, 64*32},
  477. {1, 384, 640, 64*4},
  478. };
  479. Param param;
  480. param.mode = param::RelayoutFormat::Mode::NHWC_NCHW;
  481. run(shapes, param);
  482. }
  483. }
  484. #endif
  485. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
  486. Checker<RelayoutFormat> checker(handle_cuda());
  487. UniformIntRNG rng{-50, 50};
  488. param::RelayoutFormat param;
  489. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
  490. for (DType dtype :
  491. std::vector<DType>({dtype::QuantizedS8{0.1f}, dtype::Float32{}})) {
  492. checker.set_dtype(0, dtype).set_dtype(1, dtype).set_rng(0, &rng);
  493. checker.set_param(param).execs({{2, 4, 35, 36}, {}});
  494. checker.set_param(param).execs({{2, 3, 35, 36}, {}});
  495. checker.set_param(param).execs({{2, 1, 35, 36}, {}});
  496. param.mode = param::RelayoutFormat::Mode::
  497. NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT;
  498. checker.set_param(param).execs({{4, 3, 3, 3}, {}});
  499. checker.set_param(param).execs({{4, 4, 3, 3}, {}});
  500. checker.set_param(param).execs({{1, 4, 3, 3}, {}});
  501. }
  502. }
  503. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台