You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

relayout_format.cpp 21 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. #include "megdnn/dtype.h"
  2. #include "megdnn/oprs.h"
  3. #include "test/common/checker.h"
  4. #include "test/common/rng.h"
  5. #include "test/cuda/benchmark.h"
  6. #include "test/cuda/fixture.h"
  7. using namespace megdnn;
  8. using namespace test;
  9. TEST_F(CUDA, RELAYOUT_FORMAT) {
  10. Checker<RelayoutFormat> checker(handle_cuda());
  11. UniformIntRNG rng{-50, 50};
  12. param::RelayoutFormat param;
  13. param.mode = param::RelayoutFormat::Mode::NCHW4_CHWN4;
  14. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  15. .set_dtype(1, dtype::QuantizedS8{0.1f})
  16. .set_rng(0, &rng)
  17. .set_param(param)
  18. .execs({{22, 23, 24, 25, 4}, {}});
  19. param.mode = param::RelayoutFormat::Mode::CHWN4_NCHW4;
  20. checker.execs({{22, 23, 24, 25, 4}, {}});
  21. }
  22. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4_NCHW) {
  23. Checker<RelayoutFormat> checker(handle_cuda());
  24. UniformIntRNG rng{-50, 50};
  25. UniformIntRNG u8_rng{0, 255};
  26. param::RelayoutFormat param;
  27. param.mode = param::RelayoutFormat::Mode::NCHW4_NCHW;
  28. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  29. .set_dtype(1, dtype::QuantizedS8{0.1f})
  30. .set_rng(0, &rng)
  31. .set_param(param)
  32. .execs({{1, 1, 2, 2, 4}, {}});
  33. checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
  34. .set_dtype(1, dtype::Quantized8Asymm{1.f, 128})
  35. .set_rng(0, &u8_rng)
  36. .set_param(param)
  37. .execs({{1, 1, 2, 2, 4}, {}});
  38. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  39. .set_dtype(1, dtype::QuantizedS8{0.1f})
  40. .set_rng(0, &rng)
  41. .set_param(param)
  42. .execs({{22, 23, 24, 25, 4}, {}});
  43. param.oc = 90;
  44. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  45. .set_dtype(1, dtype::QuantizedS8{0.1f})
  46. .set_rng(0, &rng)
  47. .set_param(param)
  48. .execs({{22, 23, 24, 25, 4}, {}});
  49. param.oc = 16;
  50. param.group = 8;
  51. checker.set_dtype(0, dtype::QuantizedS8{0.1f})
  52. .set_dtype(1, dtype::QuantizedS8{0.1f})
  53. .set_rng(0, &rng)
  54. .set_param(param)
  55. .execs({{11, 16, 22, 33, 4}, {}});
  56. }
  57. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4) {
  58. Checker<RelayoutFormat> checker(handle_cuda());
  59. UniformIntRNG rng{-50, 50};
  60. param::RelayoutFormat param;
  61. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  62. for (size_t n : {1, 3}) {
  63. for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
  64. for (size_t h : {3, 7, 12, 16, 22, 59, 83}) {
  65. for (size_t w : {3, 22, 63, 128, 256}) {
  66. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  67. .set_dtype(1, dtype::QuantizedS8{1.f})
  68. .set_rng(0, &rng)
  69. .set_param(param)
  70. .execs({{n, c, h, w}, {}});
  71. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  72. .set_dtype(1, dtype::QuantizedS8{2.f})
  73. .set_rng(0, &rng)
  74. .set_param(param)
  75. .execs({{n, c, h, w}, {}});
  76. checker.set_dtype(0, dtype::QuantizedS32{1.f})
  77. .set_dtype(1, dtype::QuantizedS32{1.f})
  78. .set_rng(0, &rng)
  79. .set_param(param)
  80. .execs({{n, c, h, w}, {}});
  81. }
  82. }
  83. }
  84. }
  85. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  86. .set_dtype(1, dtype::QuantizedS8{1.f})
  87. .set_rng(0, &rng)
  88. .set_param(param)
  89. .execs({{8, 3, 224, 224}, {}});
  90. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  91. .set_dtype(1, dtype::QuantizedS8{1.f})
  92. .set_rng(0, &rng)
  93. .set_param(param)
  94. .execs({{8, 3, 600, 600}, {}});
  95. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  96. .set_dtype(1, dtype::QuantizedS8{1.f})
  97. .set_rng(0, &rng)
  98. .set_param(param)
  99. .execs({{1, 6, 768, 1280}, {}});
  100. param.group = 2;
  101. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  102. .set_dtype(1, dtype::QuantizedS8{1.f})
  103. .set_rng(0, &rng)
  104. .set_param(param)
  105. .execs({{8, 6, 300, 300}, {}});
  106. param.group = 3;
  107. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  108. .set_dtype(1, dtype::QuantizedS8{1.f})
  109. .set_rng(0, &rng)
  110. .set_param(param)
  111. .execs({{8, 6, 300, 300}, {}});
  112. }
  113. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_WEIGHT) {
  114. Checker<RelayoutFormat> checker(handle_cuda());
  115. UniformIntRNG rng{-50, 50};
  116. param::RelayoutFormat param;
  117. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_WEIGHT;
  118. for (size_t oc : {1, 3, 4, 16, 33}) {
  119. for (size_t ic : {1, 2, 3, 4, 8, 9, 11, 16, 33}) {
  120. for (size_t h : {3, 5, 7}) {
  121. for (size_t w : {3, 5, 7}) {
  122. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  123. .set_dtype(1, dtype::QuantizedS8{1.f})
  124. .set_rng(0, &rng)
  125. .set_param(param)
  126. .execs({{oc, ic, h, w}, {}});
  127. }
  128. }
  129. }
  130. }
  131. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  132. .set_dtype(1, dtype::QuantizedS8{1.f})
  133. .set_rng(0, &rng)
  134. .set_param(param)
  135. .execs({{13, 13, 5, 5}, {}});
  136. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  137. .set_dtype(1, dtype::QuantizedS8{1.f})
  138. .set_rng(0, &rng)
  139. .set_param(param)
  140. .execs({{4, 16, 16, 3, 3}, {}});
  141. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  142. .set_dtype(1, dtype::QuantizedS8{1.f})
  143. .set_rng(0, &rng)
  144. .set_param(param)
  145. .execs({{4, 13, 11, 3, 3}, {}});
  146. }
  147. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_DEFAULT) {
  148. Checker<RelayoutFormat> checker(handle_cuda());
  149. UniformIntRNG rng{0, 50};
  150. param::RelayoutFormat param;
  151. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  152. for (size_t n : {1, 3}) {
  153. for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
  154. for (size_t h : {3, 7, 12, 16, 59, 83}) {
  155. for (size_t w : {3, 63, 128, 256}) {
  156. checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
  157. .set_dtype(1, dtype::QuantizedS8{1.f})
  158. .set_rng(0, &rng)
  159. .set_param(param)
  160. .execs({{n, c, h, w}, {}});
  161. }
  162. }
  163. }
  164. }
  165. }
  166. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_U8) {
  167. Checker<RelayoutFormat> checker(handle_cuda());
  168. UniformIntRNG rng{0, 255};
  169. param::RelayoutFormat param;
  170. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  171. for (size_t n : {1, 3}) {
  172. for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
  173. for (size_t h : {3, 7, 12, 16, 59, 83}) {
  174. for (size_t w : {3, 13, 3 * 4, 63 * 4, 128 * 4, 256 * 4}) {
  175. checker.set_dtype(0, dtype::Uint8())
  176. .set_dtype(1, dtype::QuantizedS8{1.f})
  177. .set_rng(0, &rng)
  178. .set_param(param)
  179. .execs({{n, c, h, w}, {}});
  180. checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
  181. .set_dtype(1, dtype::QuantizedS8{1.f})
  182. .set_rng(0, &rng)
  183. .set_param(param)
  184. .execs({{n, c, h, w}, {}});
  185. checker.set_dtype(0, dtype::Uint8())
  186. .set_dtype(1, dtype::QuantizedS8{2.5f})
  187. .set_rng(0, &rng)
  188. .set_param(param)
  189. .execs({{n, c, h, w}, {}});
  190. }
  191. }
  192. }
  193. }
  194. }
  195. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_IC_SMALL) {
  196. Checker<RelayoutFormat> checker(handle_cuda());
  197. UniformIntRNG rng{0, 50};
  198. param::RelayoutFormat param;
  199. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
  200. checker.set_dtype(0, dtype::QuantizedS8{1.f})
  201. .set_dtype(1, dtype::QuantizedS8{1.f})
  202. .set_rng(0, &rng)
  203. .set_param(param)
  204. .execs({{8, 3, 768, 1280}, {}});
  205. }
  206. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW64) {
  207. Checker<RelayoutFormat> checker(handle_cuda());
  208. UniformIntRNG s4{-8, 7};
  209. UniformIntRNG u4{0, 15};
  210. param::RelayoutFormat param;
  211. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
  212. for (size_t n : {1, 3}) {
  213. for (size_t c : {15, 64, 128}) {
  214. for (size_t h : {7, 14, 16, 28}) {
  215. for (size_t w : {2, 3, 7, 8, 16, 31}) {
  216. checker.set_dtype(0, dtype::QuantizedS4{2.f})
  217. .set_dtype(1, dtype::QuantizedS4{2.f})
  218. .set_rng(0, &s4)
  219. .set_param(param)
  220. .execs({{n, c, h, w}, {}});
  221. checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 8})
  222. .set_dtype(1, dtype::Quantized4Asymm{1.2f, 4})
  223. .set_rng(0, &u4)
  224. .set_param(param)
  225. .execs({{n, c, h, w}, {}});
  226. checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  227. .set_dtype(1, dtype::QuantizedS4{1.f})
  228. .set_rng(0, &s4)
  229. .set_param(param)
  230. .execs({{n, c, h, w}, {}});
  231. checker.set_dtype(0, dtype::Quantized4Asymm{1.19990307f, 8})
  232. .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
  233. .set_rng(0, &u4)
  234. .set_param(param)
  235. .set_epsilon(1e-3)
  236. .execs({{n, c, h, w}, {}});
  237. }
  238. }
  239. }
  240. }
  241. }
  242. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW64_NCHW) {
  243. Checker<RelayoutFormat> checker(handle_cuda());
  244. UniformIntRNG s4{-8, 7};
  245. UniformIntRNG u4{0, 15};
  246. param::RelayoutFormat param;
  247. param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
  248. for (size_t n : {1, 3}) {
  249. for (size_t c : {15, 64, 128}) {
  250. for (size_t h : {7, 14, 16, 28}) {
  251. for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
  252. if (c % 64 != 0) {
  253. param.oc = c;
  254. } else {
  255. param.oc = 0;
  256. }
  257. checker.set_dtype(0, dtype::QuantizedS4{2.f})
  258. .set_dtype(1, dtype::QuantizedS4{2.f})
  259. .set_rng(0, &s4)
  260. .set_param(param)
  261. .set_epsilon(1e-3)
  262. .execs({{n, (c + 63) / 64, h, w, 64}, {}});
  263. checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 4})
  264. .set_dtype(1, dtype::Quantized4Asymm{1.2f, 8})
  265. .set_rng(0, &u4)
  266. .set_param(param)
  267. .set_epsilon(1e-3)
  268. .execs({{n, (c + 63) / 64, h, w, 64}, {}});
  269. checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  270. .set_dtype(1, dtype::QuantizedS4{1.f})
  271. .set_rng(0, &s4)
  272. .set_param(param)
  273. .set_epsilon(1e-3)
  274. .execs({{n, (c + 63) / 64, h, w, 64}, {}});
  275. checker.set_dtype(0, dtype::Quantized4Asymm{1.20211209f, 8})
  276. .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
  277. .set_rng(0, &u4)
  278. .set_param(param)
  279. .set_epsilon(1e-3)
  280. .execs({{n, (c + 63) / 64, h, w, 64}, {}});
  281. }
  282. }
  283. }
  284. }
  285. }
  286. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NHWC) {
  287. Checker<RelayoutFormat> checker(handle_cuda());
  288. UniformIntRNG s4{-8, 7};
  289. UniformIntRNG u4{0, 15};
  290. param::RelayoutFormat param;
  291. param.mode = param::RelayoutFormat::Mode::NCHW_NHWC;
  292. for (size_t n : {1, 3}) {
  293. for (size_t c : {8, 16}) {
  294. for (size_t h : {7, 14, 16, 28}) {
  295. for (size_t w : {2, 3, 7, 8, 16, 31}) {
  296. checker.set_dtype(0, dtype::QuantizedS4{2.f})
  297. .set_dtype(1, dtype::QuantizedS4{2.f})
  298. .set_rng(0, &s4)
  299. .set_param(param)
  300. .execs({{n, c, h, w}, {}});
  301. checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 8})
  302. .set_dtype(1, dtype::Quantized4Asymm{1.2f, 4})
  303. .set_rng(0, &u4)
  304. .set_param(param)
  305. .execs({{n, c, h, w}, {}});
  306. checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  307. .set_dtype(1, dtype::QuantizedS4{1.f})
  308. .set_rng(0, &s4)
  309. .set_param(param)
  310. .execs({{n, c, h, w}, {}});
  311. checker.set_dtype(0, dtype::Quantized4Asymm{1.19990307f, 8})
  312. .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
  313. .set_rng(0, &u4)
  314. .set_param(param)
  315. .set_epsilon(1e-3)
  316. .execs({{n, c, h, w}, {}});
  317. }
  318. }
  319. }
  320. }
  321. checker.execs({{1, 256, 384, 640}, {}});
  322. }
  323. TEST_F(CUDA, RELAYOUT_FORMAT_NHWC_NCHW) {
  324. Checker<RelayoutFormat> checker(handle_cuda());
  325. UniformIntRNG s4{-8, 7};
  326. UniformIntRNG u4{0, 15};
  327. param::RelayoutFormat param;
  328. param.mode = param::RelayoutFormat::Mode::NHWC_NCHW;
  329. for (size_t n : {1, 3}) {
  330. for (size_t c : {8, 16}) {
  331. for (size_t h : {7, 14, 16, 28}) {
  332. for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
  333. checker.set_dtype(0, dtype::QuantizedS4{2.f})
  334. .set_dtype(1, dtype::QuantizedS4{2.f})
  335. .set_rng(0, &s4)
  336. .set_param(param)
  337. .set_epsilon(1e-3)
  338. .execs({{n, h, w, c}, {}});
  339. checker.set_dtype(0, dtype::Quantized4Asymm{1.2f, 4})
  340. .set_dtype(1, dtype::Quantized4Asymm{1.2f, 8})
  341. .set_rng(0, &u4)
  342. .set_param(param)
  343. .set_epsilon(1e-3)
  344. .execs({{n, h, w, c}, {}});
  345. checker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  346. .set_dtype(1, dtype::QuantizedS4{1.f})
  347. .set_rng(0, &s4)
  348. .set_param(param)
  349. .set_epsilon(1e-3)
  350. .execs({{n, h, w, c}, {}});
  351. checker.set_dtype(0, dtype::Quantized4Asymm{1.20211209f, 8})
  352. .set_dtype(1, dtype::Quantized4Asymm{1.f, 4})
  353. .set_rng(0, &u4)
  354. .set_param(param)
  355. .set_epsilon(1e-3)
  356. .execs({{n, h, w, c}, {}});
  357. }
  358. }
  359. }
  360. }
  361. checker.execs({{1, 384, 640, 256}, {}});
  362. }
  363. #if MEGDNN_WITH_BENCHMARK
  364. TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
  365. using Param = RelayoutFormat::Param;
  366. auto run = [&](const TensorShapeArray& shapes, Param param, Param default_param) {
  367. Benchmarker<RelayoutFormat> benchmarker(handle_cuda());
  368. benchmarker.set_param(param);
  369. benchmarker.set_dtype(0, dtype::QuantizedS8{1.f})
  370. .set_dtype(1, dtype::QuantizedS8{1.f});
  371. Benchmarker<RelayoutFormat> benchmarker_default(handle_cuda());
  372. benchmarker_default.set_param(default_param);
  373. benchmarker_default.set_dtype(0, dtype::QuantizedS8{1.f})
  374. .set_dtype(1, dtype::QuantizedS8{1.f});
  375. for (auto&& shape : shapes) {
  376. double memaccess = (double(shape.total_nr_elems()) +
  377. double(shape[0]) * ((shape[1] + 3) / 4 * 4) * shape[2] *
  378. shape[3]) *
  379. 1e-6;
  380. auto time_ms = benchmarker.execs({shape, {}});
  381. if (shape[1] <= 4) {
  382. auto time_default_ms = benchmarker_default.execs({shape, {}});
  383. printf("execute %s, time %.4f ms, %.4f GB/s, default %.4f "
  384. "GB/s\n",
  385. shape.to_string().c_str(), time_ms, memaccess / time_ms,
  386. memaccess / time_default_ms);
  387. } else {
  388. printf("execute %s, time %.4f ms, %.4f GB/s\n",
  389. shape.to_string().c_str(), time_ms, memaccess / time_ms);
  390. }
  391. }
  392. };
  393. TensorShapeArray shapes = {
  394. {8, 1, 768, 1280}, {8, 3, 768, 1280}, {8, 3, 224, 224},
  395. {8, 4, 768, 1280}, {64, 3, 768, 1280},
  396. };
  397. {
  398. Param param;
  399. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
  400. Param default_param;
  401. default_param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
  402. run(shapes, param, default_param);
  403. }
  404. }
  405. TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT_QS4) {
  406. using Param = RelayoutFormat::Param;
  407. auto run = [&](const TensorShapeArray& shapes, Param param) {
  408. CUBenchmarker<RelayoutFormat> benchmarker(handle_cuda());
  409. benchmarker.set_param(param);
  410. benchmarker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
  411. .set_dtype(1, dtype::QuantizedS4{1.19990307f});
  412. for (auto&& shape : shapes) {
  413. double memaccess = double(TensorLayout(shape, dtype::QuantizedS4{1.f})
  414. .span()
  415. .dist_byte()) *
  416. 2e-6;
  417. auto time_ms = benchmarker.execs({shape, {}});
  418. printf("execute %s, time %.4f ms, %.4f GB/s\n", shape.to_string().c_str(),
  419. time_ms, memaccess / time_ms);
  420. }
  421. };
  422. printf("nchw -> nchw64\n");
  423. {
  424. TensorShapeArray shapes = {
  425. {1, 64, 56, 56}, {16, 64, 56, 56}, {64, 64, 56, 56}, {1, 64, 56, 55},
  426. {16, 64, 56, 55}, {64, 64, 56, 55}, {1, 256, 384, 640},
  427. };
  428. Param param;
  429. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
  430. run(shapes, param);
  431. }
  432. printf("nchw -> nhwc\n");
  433. {
  434. TensorShapeArray shapes = {
  435. {1, 64, 56, 56}, {16, 64, 56, 56}, {64, 64, 56, 56},
  436. {1, 64, 56, 55}, {16, 64, 56, 55}, {64, 64, 56, 55},
  437. {1, 256, 384, 640}, {16, 16, 384, 640},
  438. };
  439. Param param;
  440. param.mode = param::RelayoutFormat::Mode::NCHW_NHWC;
  441. run(shapes, param);
  442. }
  443. printf("nchw64 -> nchw\n");
  444. {
  445. TensorShapeArray shapes = {
  446. {64, 1, 56, 56, 64}, {1, 32, 7, 7, 64}, {16, 32, 7, 7, 64},
  447. {64, 32, 7, 7, 64}, {1, 4, 384, 640, 64},
  448. };
  449. Param param;
  450. param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
  451. run(shapes, param);
  452. }
  453. printf("nhwc -> nchw\n");
  454. {
  455. TensorShapeArray shapes = {
  456. {64, 56, 56, 64}, {1, 7, 7, 64 * 32}, {16, 7, 7, 64 * 32},
  457. {64, 7, 7, 64 * 32}, {1, 384, 640, 64 * 4},
  458. };
  459. Param param;
  460. param.mode = param::RelayoutFormat::Mode::NHWC_NCHW;
  461. run(shapes, param);
  462. }
  463. }
  464. #endif
  465. TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
  466. Checker<RelayoutFormat> checker(handle_cuda());
  467. UniformIntRNG rng{-50, 50};
  468. param::RelayoutFormat param;
  469. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
  470. for (DType dtype :
  471. std::vector<DType>({dtype::QuantizedS8{0.1f}, dtype::Float32{}})) {
  472. checker.set_dtype(0, dtype).set_dtype(1, dtype).set_rng(0, &rng);
  473. checker.set_param(param).execs({{2, 4, 35, 36}, {}});
  474. checker.set_param(param).execs({{2, 3, 35, 36}, {}});
  475. checker.set_param(param).execs({{2, 1, 35, 36}, {}});
  476. param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT;
  477. checker.set_param(param).execs({{4, 3, 3, 3}, {}});
  478. checker.set_param(param).execs({{4, 4, 3, 3}, {}});
  479. checker.set_param(param).execs({{1, 4, 3, 3}, {}});
  480. }
  481. }
  482. // vim: syntax=cpp.doxygen