You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

relayout.cpp 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979
  1. #include "test/common/relayout.h"
  2. #include "megdnn/oprs.h"
  3. #include "test/common/checker.h"
  4. #include "test/common/rng.h"
  5. #include "test/cuda/benchmark.h"
  6. #include "test/cuda/fixture.h"
  7. using namespace megdnn;
  8. using namespace test;
namespace {
// Instantiate the shared relayout test suite (one typed test per entry in
// relayout::test_types) against the CUDA handle.
template <typename tag>
class CUDA_RELAYOUT : public CUDA {};
TYPED_TEST_CASE(CUDA_RELAYOUT, relayout::test_types);
TYPED_TEST(CUDA_RELAYOUT, run) {
    relayout::run_test<TypeParam>(this->handle_cuda());
}
}  // namespace
  17. TEST_F(CUDA, RELAYOUT_TRANSPOSE) {
  18. Checker<Relayout> checker(handle_cuda());
  19. auto run = [&](size_t batch, size_t m, size_t n, size_t c, DType dtype) {
  20. checker.set_dtype(0, dtype).set_dtype(1, dtype);
  21. TensorLayout src = {{batch, m, n, c}, dtype};
  22. src.init_contiguous_stride();
  23. TensorLayout dst = {{batch, m, n, c}, dtype};
  24. dst.stride[0] = m * n * c;
  25. dst.stride[1] = c;
  26. dst.stride[2] = m * c;
  27. dst.stride[3] = 1;
  28. checker.execl({src, dst});
  29. };
  30. run(16, 30, 40, 4, dtype::Int8());
  31. run(16, 20, 10, 4, dtype::Int8());
  32. run(1, 30, 20, 1, dtype::Int32());
  33. run(1, 20, 30, 1, dtype::Int32());
  34. run(1, 11, 21, 1, dtype::Float32());
  35. }
  36. #if MEGDNN_WITH_BENCHMARK
  37. TEST_F(CUDA, BENCHMARK_RELAYOUT_TRANSPOSE) {
  38. static constexpr size_t RUNS = 1000;
  39. CUBenchmarker<Relayout> benchmarker(handle_cuda());
  40. benchmarker.set_times(RUNS);
  41. auto run = [&](size_t batch, size_t m, size_t n, size_t c, DType dtype) {
  42. benchmarker.set_dtype(0, dtype).set_dtype(1, dtype);
  43. TensorLayout src = {{batch, m, n, c}, dtype};
  44. src.init_contiguous_stride();
  45. TensorLayout dst = {{batch, m, n, c}, dtype};
  46. dst.stride[0] = m * n * c;
  47. dst.stride[1] = c;
  48. dst.stride[2] = m * c;
  49. dst.stride[3] = 1;
  50. auto time_ms = benchmarker.execl({src, dst}) / RUNS;
  51. printf("{%zux%zux%zux%zu}->{%zux%zux%zux%zu} bandwidth: %.2f gbps\n", batch, m,
  52. n, c, batch, n, m, c,
  53. 2.f * batch * m * n * c * dtype.size() / (1e6 * time_ms));
  54. };
  55. run(16, 640, 480, 4, dtype::Int8());
  56. run(256, 224, 224, 4, dtype::Int8());
  57. run(1, 256, 224 * 224, 1, dtype::Int32());
  58. run(1, 256, 7 * 7 * 512, 1, dtype::Int32());
  59. run(1, 4096, 4096, 1, dtype::Float32());
  60. }
TEST_F(CUDA, BENCHMARK_RELAYOUT) {
    //! benchmark contiguous layout, such as (a, b, c, d) -> (b, a, c, d)
    //! just change the first two axes
    static constexpr size_t RUNS = 3;
    // Benchmark relayout where src is a dimshuffled (strided) view of a
    // contiguous tensor and dst is fully contiguous.
    auto run = [&](const TensorLayoutArray& layouts) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        for (auto&& layout : layouts) {
            // src: same buffer, axes 0 and 1 swapped via strides only
            TensorLayout src = layout.dimshuffle({1, 0, 2});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            // factor 2: each element is read once and written once
            printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                           1000 / (1024 * 1024 * 1024));
        }
    };
    TensorLayoutArray layouts = {
            {{12, 23, 2}, dtype::Int32()},    {{12, 23, 8}, dtype::Int32()},
            {{12, 23, 17}, dtype::Int32()},   {{12, 23, 64}, dtype::Int32()},
            {{12, 23, 129}, dtype::Int32()},  {{12, 23, 256}, dtype::Int32()},
            {{12, 23, 1029}, dtype::Int32()}, {{12, 23, 4096}, dtype::Int32()},
            {{12, 23, 9143}, dtype::Int32()}, {{12, 23, 18284}, dtype::Int32()},
            {{2, 2, 1000000}, dtype::Int32()},
    };
    run(layouts);
    // rank-4 variant: src swaps axes 1 and 2 via dimshuffle.
    // NOTE(review): src permutes axes 1<->2 while dst swaps axes 0<->1, so
    // the two shapes can differ (only the element count matches) — presumably
    // intentional for a pure-bandwidth benchmark, but worth confirming.
    auto run2 = [&](const TensorLayoutArray& layouts) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({0, 2, 1, 3});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                           1000 / (1024 * 1024 * 1024));
        }
    };
    layouts = {
            {{3, 12, 24, 100}, dtype::Int32()},
            {{3, 12, 24, 1029}, dtype::Int32()},
            {{3, 4, 24, 9143}, dtype::Int32()},
            {{3, 4, 24, 18284}, dtype::Int32()},
    };
    run2(layouts);
}
  110. TEST_F(CUDA, BENCHMARK_RELAYOUT_LAST_CONTIG) {
  111. //! src and dst are all get subtensor in channel axis
  112. static constexpr size_t RUNS = 3;
  113. Benchmarker<Relayout> benchmarker(handle_cuda());
  114. benchmarker.set_times(RUNS);
  115. TensorLayout src =
  116. TensorLayout({5, 5, 100000}, {800000, 100000, 1}, dtype::Float32());
  117. TensorLayout dst =
  118. TensorLayout({5, 5, 100000}, {700000, 100000, 1}, dtype::Float32());
  119. auto used = benchmarker.execl({src, dst});
  120. printf("src: %s dst: %s bandwith: %f gbps/s\n", src.to_string().c_str(),
  121. dst.to_string().c_str(),
  122. 2 * src.total_nr_elems() * src.dtype.size() * RUNS / used * 1000 /
  123. (1024 * 1024 * 1024));
  124. }
  125. TEST_F(CUDA, BENCHMARK_RELAYOUT_LAST_NOT_CONTIG) {
  126. static constexpr size_t RUNS = 3;
  127. auto run = [&](TensorLayout src, TensorLayout dst) {
  128. Benchmarker<Relayout> benchmarker(handle_cuda());
  129. auto&& layout = src;
  130. benchmarker.set_times(RUNS);
  131. dst.init_contiguous_stride();
  132. auto used = benchmarker.execl({src, dst});
  133. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  134. 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
  135. (1024 * 1024 * 1024));
  136. };
  137. run({{16, 128, 128}, {49152, 384, 3}, dtype::Float32()},
  138. {{16, 128, 128}, {16384, 128, 1}, dtype::Float32()});
  139. }
TEST_F(CUDA, BENCHMARK_RELAYOUT_6) {
    // Benchmark (and verify) relayout over a fixed battery of rank-2..rank-6
    // shapes, each paired positionally with a permutation in `permutations`
    // (the two arrays must stay the same length and in the same order:
    // 57 entries each, grouped in triples per permutation).
    static constexpr size_t RUNS = 3;
    auto run = [&](TensorLayoutArray layouts,
                   std::vector<std::vector<size_t>> permutations) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        int i = 0;
        for (auto&& layout : layouts) {
            auto per = permutations[i];
            TensorLayout src = layout.dimshuffle(per);
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            // also verify the result of the case just benchmarked
            Checker<Relayout> checker(handle_cuda());
            checker.exec(TensorLayoutArray{src, dst});
            printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                           1000 / (1024 * 1024 * 1024));
            i++;
        }
    };
    TensorLayoutArray layouts = {
            // rank 2
            {{7248, 7248}, dtype::Int32()},
            {{43408, 1216}, dtype::Int32()},
            {{1216, 43408}, dtype::Int32()},
            // rank 3
            {{368, 384, 384}, dtype::Int32()},
            {{2144, 64, 384}, dtype::Int32()},
            {{368, 64, 2307}, dtype::Int32()},
            {{384, 384, 355}, dtype::Int32()},
            {{2320, 384, 59}, dtype::Int32()},
            {{384, 2320, 59}, dtype::Int32()},
            {{384, 355, 384}, dtype::Int32()},
            {{2320, 59, 384}, dtype::Int32()},
            {{384, 59, 2320}, dtype::Int32()},
            // rank 4
            {{80, 96, 75, 96}, dtype::Int32()},
            {{464, 16, 75, 96}, dtype::Int32()},
            {{80, 16, 75, 582}, dtype::Int32()},
            {{96, 75, 96, 75}, dtype::Int32()},
            {{608, 12, 96, 75}, dtype::Int32()},
            {{96, 12, 608, 75}, dtype::Int32()},
            {{96, 75, 96, 75}, dtype::Int32()},
            {{608, 12, 96, 75}, dtype::Int32()},
            {{96, 12, 608, 75}, dtype::Int32()},
            {{96, 96, 75, 75}, dtype::Int32()},
            {{608, 96, 12, 75}, dtype::Int32()},
            {{96, 608, 12, 75}, dtype::Int32()},
            {{96, 75, 75, 96}, dtype::Int32()},
            {{608, 12, 75, 96}, dtype::Int32()},
            {{96, 12, 75, 608}, dtype::Int32()},
            // rank 5
            {{32, 48, 28, 28, 48}, dtype::Int32()},
            {{176, 8, 28, 28, 48}, dtype::Int32()},
            {{32, 8, 28, 28, 298}, dtype::Int32()},
            {{48, 28, 28, 48, 28}, dtype::Int32()},
            {{352, 4, 28, 48, 28}, dtype::Int32()},
            {{48, 4, 28, 352, 28}, dtype::Int32()},
            {{48, 28, 48, 28, 28}, dtype::Int32()},
            {{352, 4, 48, 28, 28}, dtype::Int32()},
            {{48, 4, 352, 28, 28}, dtype::Int32()},
            {{48, 48, 28, 28, 28}, dtype::Int32()},
            {{352, 48, 4, 28, 28}, dtype::Int32()},
            {{48, 352, 4, 28, 28}, dtype::Int32()},
            {{48, 28, 28, 28, 48}, dtype::Int32()},
            {{352, 4, 28, 28, 48}, dtype::Int32()},
            {{48, 4, 28, 28, 352}, dtype::Int32()},
            // rank 6
            {{16, 32, 15, 32, 15, 15}, dtype::Int32()},
            {{48, 10, 15, 32, 15, 15}, dtype::Int32()},
            {{16, 10, 15, 103, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 32, 15, 15}, dtype::Int32()},
            {{112, 5, 15, 32, 15, 15}, dtype::Int32()},
            {{32, 5, 15, 112, 15, 15}, dtype::Int32()},
            {{32, 15, 32, 15, 15, 15}, dtype::Int32()},
            {{112, 5, 32, 15, 15, 15}, dtype::Int32()},
            {{32, 5, 112, 15, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 32, 15, 15}, dtype::Int32()},
            {{112, 5, 15, 32, 15, 15}, dtype::Int32()},
            {{32, 5, 15, 112, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 15, 15, 32}, dtype::Int32()},
            {{112, 5, 15, 15, 15, 32}, dtype::Int32()},
            {{32, 5, 15, 15, 15, 112}, dtype::Int32()},
    };
    // One permutation per layout above; each distinct permutation is applied
    // to three consecutive layouts.
    std::vector<std::vector<size_t>> permutations = {
            std::vector<size_t>{1, 0},
            std::vector<size_t>{1, 0},
            std::vector<size_t>{1, 0},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{5, 4, 3, 2, 1, 0},
            std::vector<size_t>{5, 4, 3, 2, 1, 0},
            std::vector<size_t>{5, 4, 3, 2, 1, 0}};
    run(layouts, permutations);
}
  281. TEST_F(CUDA, BENCHMARK_RELAYOUT_7) {
  282. static constexpr size_t RUNS = 3;
  283. auto isTrivial = [&](std::vector<size_t>& permutation) {
  284. for (size_t i = 0; i < permutation.size(); i++) {
  285. if (permutation[i] != i)
  286. return false;
  287. }
  288. return true;
  289. };
  290. auto run = [&](TensorLayout layout, std::vector<size_t> per) {
  291. Benchmarker<Relayout> benchmarker(handle_cuda());
  292. benchmarker.set_times(RUNS);
  293. TensorLayout src = layout.dimshuffle(per);
  294. TensorLayout dst = layout;
  295. std::swap(dst.shape[0], dst.shape[1]);
  296. dst.init_contiguous_stride();
  297. auto used = benchmarker.execl({src, dst});
  298. Checker<Relayout> checker(handle_cuda());
  299. checker.exec(TensorLayoutArray{src, dst});
  300. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  301. 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
  302. (1024 * 1024 * 1024));
  303. };
  304. std::vector<size_t> _dim = {5, 3, 2, 4, 35, 33, 37};
  305. std::vector<size_t> permutation(7);
  306. // Inverse
  307. for (size_t r = 0; r < _dim.size(); r++) {
  308. size_t size = _dim.size();
  309. permutation[r] = size - 1 - r;
  310. }
  311. run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5], _dim[6]},
  312. dtype::Int32()},
  313. permutation);
  314. // Random
  315. for (size_t r = 0; r < _dim.size(); r++)
  316. permutation[r] = r;
  317. for (int nsample = 0; nsample < 50; nsample++) {
  318. COMPAT_RANDOM(_dim.begin(), _dim.end());
  319. COMPAT_RANDOM(permutation.begin(), permutation.end());
  320. if (!isTrivial(permutation)) {
  321. run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5], _dim[6]},
  322. dtype::Int32()},
  323. permutation);
  324. }
  325. }
  326. }
TEST_F(CUDA, BENCHMARK_RELAYOUT_5) {
    // Benchmark rank-5 relayout on randomly generated shapes: the element
    // count is drawn around numElemAvg and dimension sizes follow a linear
    // ramp so that (before shuffling) the last dim is ~`ratio`x the first.
    static constexpr size_t RUNS = 10;
    // true iff `permutation` is the identity mapping
    auto isTrivial = [&](std::vector<size_t>& permutation) {
        for (size_t i = 0; i < permutation.size(); i++) {
            if (permutation[i] != i)
                return false;
        }
        return true;
    };
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        // std::swap(dst.shape[0], dst.shape[1]);
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        // also verify correctness of the benchmarked case
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };
    size_t two = 2;  // lower bound for every dimension
    int ratio = 5;
    int numElemAvg = 1000000 * 200;
    UniformFloatRNG numElem_dist((double)numElemAvg, (double)numElemAvg * 0.2);
    for (int rank = 5; rank <= 5; rank++) {
        for (int iter = 0; iter < 20; iter++) {
            int numElem = (int)numElem_dist.gen_single_val();
            std::vector<size_t> dim(rank);
            std::vector<size_t> permutation(rank);
            std::vector<double> dimf(rank);
            double volf = 1.0;
            // dimf ramps linearly from 1 to `ratio`; volf is its product
            for (int r = 0; r < rank; r++) {
                permutation[r] = (size_t)r;
                dimf[r] = 1.0 + (double)r * (ratio - 1.0) / (double)(rank - 1);
                volf *= dimf[r];
            }
            // fprintf(stderr, "volf %lf\n", volf);
            // scale the ramp so the product approximates numElem
            double scale = pow((double)numElem / volf, 1.0 / (double)rank);
            // fprintf(stderr, "scale %lf\n", scale);
            int vol = 1;
            for (int r = 0; r < rank; r++) {
                if (r == rank - 1) {
                    // force the last dim to `ratio` times the first
                    // (dim[0] was already set in the r == 0 iteration)
                    dim[r] = ratio * dim[0];
                } else {
                    dim[r] = (size_t)round(dimf[r] * scale);
                }
                dim[r] = std::max(two, dim[r]);
                vol *= dim[r];
            }
            // fprintf(stderr, "dim[0] %lf\n", dim[0]);
            double cur_ratio = (double)dim[rank - 1] / (double)dim[0];
            double vol_re = fabs((double)(vol - numElem) / (double)numElem);
            // Fix dimensions if volume is off by more than 5%
            if (vol_re > 0.05) {
                // d is 1, or SIZE_MAX acting as -1 via unsigned wraparound:
                // dim[r] + d then effectively decrements dim[r] by one
                size_t d = (vol < numElem) ? 1 : -1;
                int r = 1;
                while (vol_re > 0.05 && r < rank) {
                    size_t dim_plus_d = std::max(two, dim[r] + d);
                    vol = (vol / dim[r]) * dim_plus_d;
                    dim[r] = dim_plus_d;
                    vol_re = fabs((double)(vol - numElem) / (double)numElem);
                    r++;
                }
            }
            size_t minDim = *(std::min_element(dim.begin(), dim.end()));
            size_t maxDim = *(std::max_element(dim.begin(), dim.end()));
            cur_ratio = (double)maxDim / (double)minDim;
            printf("vol %d cur_ratio %lf | %lf\n", vol, cur_ratio, vol_re);
            // printVec(dim);
            COMPAT_RANDOM(dim.begin(), dim.end());
            // resample until the permutation is not the identity
            while (isTrivial(permutation)) {
                COMPAT_RANDOM(permutation.begin(), permutation.end());
            }
            run({{dim[0], dim[1], dim[2], dim[3], dim[4]}, dtype::Int32()},
                permutation);
            // if (!bench_tensor<T>(dim, permutation)) return false;
        }
    }
}
  409. TEST_F(CUDA, BENCHMARK_RELAYOUT_NCHW_NCHW4) {
  410. static constexpr size_t RUNS = 10;
  411. auto run = [&](TensorLayout layout, std::vector<size_t> per) {
  412. CUBenchmarker<Relayout> benchmarker(handle_cuda());
  413. benchmarker.set_times(RUNS);
  414. TensorLayout src = layout.dimshuffle(per);
  415. TensorLayout dst = layout;
  416. dst.init_contiguous_stride();
  417. auto used = benchmarker.execl({src, dst});
  418. Checker<Relayout> checker(handle_cuda());
  419. checker.exec(TensorLayoutArray{src, dst});
  420. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  421. 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
  422. (1024 * 1024 * 1024));
  423. };
  424. UniformIntRNG u(2, 100);
  425. printf("NCHW->NCHW4\n");
  426. for (int i = 0; i < 20; i++) {
  427. int d1 = u.gen_single_val();
  428. int d2 = (u.gen_single_val() / 4 + 1) * 4;
  429. int d3 = 4;
  430. // int d4=(u.gen_single_val()/4+1)*4;
  431. int d4 = (u.gen_single_val());
  432. int d5 = (u.gen_single_val());
  433. // int d5=(u.gen_single_val()/4+1)*4;
  434. // int d5 = (u.gen_single_val())*2+1;
  435. run({{(size_t)d1, (size_t)d2 / 4, (size_t)d3, (size_t)d4, (size_t)d5},
  436. {d2 * d3 * d4 * d5 / 4, d3 * d4 * d5, d4 * d5, d5, 1},
  437. dtype::Int8()},
  438. {0, 1, 3, 4, 2});
  439. }
  440. printf("\n\nNCHW4->NCHW\n");
  441. for (int i = 0; i < 20; i++) {
  442. int d1 = u.gen_single_val();
  443. int d2 = (u.gen_single_val() / 4 + 1) * 4;
  444. int d3 = u.gen_single_val();
  445. // int d5=(u.gen_single_val()/4+1)*4;
  446. int d4 = u.gen_single_val();
  447. int d5 = 4;
  448. run({{(size_t)d1, (size_t)d2 / 4, (size_t)d3, (size_t)d4, (size_t)d5},
  449. {d2 * d3 * d4 * d5 / 4, d3 * d4 * d5, d4 * d5, d5, 1},
  450. dtype::Int8()},
  451. {0, 1, 4, 2, 3});
  452. }
  453. }
  454. TEST_F(CUDA, BENCHMARK_RELAYOUT_NCHW4_NCHW32) {
  455. static constexpr size_t RUNS = 10;
  456. auto run = [&](TensorLayout layout, std::vector<size_t> per) {
  457. CUBenchmarker<Relayout> benchmarker(handle_cuda());
  458. benchmarker.set_times(RUNS);
  459. TensorLayout src = layout.dimshuffle(per);
  460. TensorLayout dst = layout;
  461. dst.init_contiguous_stride();
  462. auto used = benchmarker.execl({src, dst});
  463. Checker<Relayout> checker(handle_cuda());
  464. checker.exec(TensorLayoutArray{src, dst});
  465. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  466. 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
  467. (1024 * 1024 * 1024));
  468. };
  469. UniformIntRNG u(4, 50);
  470. printf("NCHW4 to NCHW32\n");
  471. for (int i = 0; i < 20; i++) {
  472. int d1 = u.gen_single_val();
  473. int d2 = (u.gen_single_val() / 8 + 1) * 8;
  474. int d3 = 8;
  475. int d4 = u.gen_single_val();
  476. int d5 = u.gen_single_val();
  477. int d6 = 4;
  478. run({{(size_t)d1, (size_t)d2 / 8, (size_t)d3, (size_t)d4, (size_t)d5,
  479. (size_t)d6},
  480. {d2 * d3 * d4 * d5 * d6 / 8, d3 * d4 * d5 * d6, d4 * d5 * d6, d5 * d6, d6,
  481. 1},
  482. dtype::Int8()},
  483. {0, 1, 3, 4, 2, 5});
  484. }
  485. printf("\n\nNCHW32 to NCHW4\n");
  486. for (int i = 0; i < 20; i++) {
  487. int d1 = u.gen_single_val();
  488. int d2 = (u.gen_single_val() / 8 + 1) * 8;
  489. int d3 = u.gen_single_val();
  490. int d4 = u.gen_single_val();
  491. int d5 = 8;
  492. int d6 = 4;
  493. run({{(size_t)d1, (size_t)d2 / 8, (size_t)d3, (size_t)d4, (size_t)d5,
  494. (size_t)d6},
  495. {d2 * d3 * d4 * d5 * d6 / 8, d3 * d4 * d5 * d6, d4 * d5 * d6, d5 * d6, d6,
  496. 1},
  497. dtype::Int8()},
  498. {0, 1, 4, 2, 3, 5});
  499. }
  500. }
TEST_F(CUDA, BENCHMARK_LAST_CONTIG_ALIGN_TEST) {
    // Benchmark rank-6 int8 relayout where the last (contiguous) axis is
    // sometimes forced to a multiple of 4, comparing aligned vs unaligned
    // innermost-axis sizes.
    static constexpr size_t RUNS = 10;
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        // std::swap(dst.shape[0], dst.shape[1]);
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        // also verify correctness of the benchmarked case
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };
    UniformIntRNG u(4, 50);
    std::vector<size_t> _dim(6);
    std::vector<size_t> permutation(_dim.size());
    // start from the full-reverse permutation
    for (size_t r = 0; r < _dim.size(); r++) {
        size_t size = _dim.size();
        permutation[r] = size - 1 - r;
    }
    _dim[0] = u.gen_single_val();
    _dim[1] = u.gen_single_val();
    _dim[2] = u.gen_single_val();
    _dim[3] = u.gen_single_val();
    _dim[4] = u.gen_single_val();
    _dim[5] = (u.gen_single_val() / 4 + 1) * 4;  // 4-aligned last axis
    run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5]}, dtype::Int8()},
        permutation);
    // Random: shuffle all but the last axis / last permutation slot so the
    // innermost axis stays innermost (and contiguous) in src and dst
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 20; nsample++) {
        COMPAT_RANDOM(_dim.begin(), _dim.end() - 1);
        COMPAT_RANDOM(permutation.begin(), permutation.end() - 1);
        if (nsample < 5)
            _dim[5] = (u.gen_single_val() / 4 + 1) * 4;  // aligned cases
        else
            _dim[5] = u.gen_single_val();  // possibly unaligned
        run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5]}, dtype::Int8()},
            permutation);
    }
}
  546. #endif
TEST_F(CUDA, RELAYOUT) {
    // Correctness test over src/dst layout pairs covering contiguous,
    // strided, interleaved, transposed and collapsible cases (float16/int32).
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
    {
        // contiguous stride
        args.emplace_back(
                TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()),
                TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()),
                TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Float16()),
                TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Float16()));
    }
    // interleaved layouts: innermost stride 2 or 3 (elements with gaps)
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int32()));
    // last axis not contiguous in src, contiguous in dst
    args.emplace_back(
            TensorLayout({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
            TensorLayout({16, 128, 128}, {16384, 128, 1}, dtype::Float32()));
    {
        // 1d
        size_t n = 10000;
        args.emplace_back(
                TensorLayout({n}, {1}, dtype::Int32()),
                TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {1}, dtype::Int32()),
                TensorLayout({n}, {2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {2}, dtype::Int32()),
                TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {2}, dtype::Int32()),
                TensorLayout({n}, {2}, dtype::Int32()));
    }
    {
        // 2d: row-major vs column-major, with and without interleaving;
        // k2 + 1 makes the row stride non-collapsible with the column stride
        size_t m = 200, n = 300, k = 400;
        ptrdiff_t k2 = k * 2;
        args.emplace_back(
                TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
    }
    {
        // 3d
        size_t m = 20, n = 30, k = 40;
        ptrdiff_t k2 = k;
        args.emplace_back(
                TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()),
                TensorLayout(
                        {m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()));
    }
    {
        // simplify_layout
        // 234..56
        // 2..3456
        args.emplace_back(
                TensorLayout(
                        {2, 3, 4, 5, 6},
                        {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
                        dtype::Int32()),
                TensorLayout(
                        {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
                        dtype::Int32()));
    }
    Checker<Relayout> checker(handle_cuda());
    for (auto&& arg : args) {
        checker.exec(TensorLayoutArray{arg.src, arg.dst});
    }
}
  658. TEST_F(CUDA, TRANSPOSE_INT8) {
  659. auto run = [&](TensorLayout layout, std::vector<size_t> per) {
  660. TensorLayout src = layout.dimshuffle(per);
  661. TensorLayout dst = layout;
  662. dst.init_contiguous_stride();
  663. Checker<Relayout> checker(handle_cuda());
  664. checker.exec(TensorLayoutArray{src, dst});
  665. };
  666. //! for last contig(NCHW4<->NCHW32)
  667. run({{5, 8, 4, 3, 8}, dtype::Int8()}, {1, 3, 0, 2, 4});
  668. run({{5, 8, 4, 3, 5}, dtype::Int8()}, {1, 3, 0, 2, 4});
  669. run({{5, 8, 4, 3, 64}, dtype::Int8()}, {1, 3, 0, 2, 4});
  670. //! for last no contig(NCHW->NCHW4)
  671. run({{7, 4, 32}, dtype::Int8()}, {2, 0, 1});
  672. run({{7, 4, 64}, dtype::Int8()}, {2, 0, 1});
  673. run({{7, 4, 7}, dtype::Int8()}, {2, 0, 1});
  674. //! for copy
  675. run({{2, 3, 4, 5, 6},
  676. {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  677. dtype::Int8()},
  678. {0, 1, 2, 3, 4});
  679. }
  680. TEST_F(CUDA, RELAYOUT_INT8) {
  681. struct Arg {
  682. TensorLayout src, dst;
  683. Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
  684. };
  685. std::vector<Arg> args;
  686. {
  687. // contiguous stride
  688. args.emplace_back(
  689. TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Int8()),
  690. TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Int8()));
  691. args.emplace_back(
  692. TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Int8()),
  693. TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Int8()));
  694. args.emplace_back(
  695. TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Int8()),
  696. TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Int8()));
  697. }
  698. args.emplace_back(
  699. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()),
  700. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()));
  701. args.emplace_back(
  702. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  703. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()));
  704. args.emplace_back(
  705. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  706. TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int8()));
  707. args.emplace_back(
  708. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()),
  709. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()));
  710. args.emplace_back(
  711. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  712. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()));
  713. args.emplace_back(
  714. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  715. TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int8()));
  716. args.emplace_back(
  717. TensorLayout({16, 128, 128}, {49152, 384, 3}, dtype::Int8()),
  718. TensorLayout({16, 128, 128}, {16384, 128, 1}, dtype::Int8()));
  719. {
  720. // 1d
  721. size_t n = 10000;
  722. args.emplace_back(
  723. TensorLayout({n}, {1}, dtype::Int8()),
  724. TensorLayout({n}, {1}, dtype::Int8()));
  725. args.emplace_back(
  726. TensorLayout({n}, {1}, dtype::Int8()),
  727. TensorLayout({n}, {2}, dtype::Int8()));
  728. args.emplace_back(
  729. TensorLayout({n}, {2}, dtype::Int8()),
  730. TensorLayout({n}, {1}, dtype::Int8()));
  731. args.emplace_back(
  732. TensorLayout({n}, {2}, dtype::Int8()),
  733. TensorLayout({n}, {2}, dtype::Int8()));
  734. }
  735. {
  736. // 2d
  737. size_t m = 200, n = 300, k = 400;
  738. ptrdiff_t k2 = k * 2;
  739. args.emplace_back(
  740. TensorLayout({m, n}, {k2, 2}, dtype::Int8()),
  741. TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int8()));
  742. args.emplace_back(
  743. TensorLayout({m, n}, {2, k2}, dtype::Int8()),
  744. TensorLayout({m, n}, {2, k2 + 1}, dtype::Int8()));
  745. args.emplace_back(
  746. TensorLayout({m, n}, {2, k2}, dtype::Int8()),
  747. TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int8()));
  748. args.emplace_back(
  749. TensorLayout({m, n}, {k2, 2}, dtype::Int8()),
  750. TensorLayout({m, n}, {2, k2 + 1}, dtype::Int8()));
  751. args.emplace_back(
  752. TensorLayout({m, n}, {k2, 1}, dtype::Int8()),
  753. TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int8()));
  754. args.emplace_back(
  755. TensorLayout({m, n}, {1, k2}, dtype::Int8()),
  756. TensorLayout({m, n}, {1, k2 + 1}, dtype::Int8()));
  757. args.emplace_back(
  758. TensorLayout({m, n}, {1, k2}, dtype::Int8()),
  759. TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int8()));
  760. args.emplace_back(
  761. TensorLayout({m, n}, {k2, 1}, dtype::Int8()),
  762. TensorLayout({m, n}, {1, k2 + 1}, dtype::Int8()));
  763. }
  764. {
  765. // 3d
  766. size_t m = 20, n = 30, k = 40;
  767. ptrdiff_t k2 = k;
  768. args.emplace_back(
  769. TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int8()),
  770. TensorLayout(
  771. {m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2}, dtype::Int8()));
  772. }
  773. {
  774. // simplify_layout
  775. // 234..56
  776. // 2..3456
  777. args.emplace_back(
  778. TensorLayout(
  779. {2, 3, 4, 5, 6},
  780. {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  781. dtype::Int8()),
  782. TensorLayout(
  783. {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
  784. dtype::Int8()));
  785. args.emplace_back(
  786. TensorLayout(
  787. {2, 3, 4, 5, 6},
  788. {4 * 3 * 4 * 5 * 6, 4 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  789. dtype::Int8()),
  790. TensorLayout(
  791. {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
  792. dtype::Int8()));
  793. }
  794. Checker<Relayout> checker(handle_cuda());
  795. for (auto&& arg : args) {
  796. checker.exec(TensorLayoutArray{arg.src, arg.dst});
  797. }
  798. }
  799. TEST_F(CUDA, RELAYOUT_TEST) {
  800. struct Arg {
  801. TensorLayout src, dst;
  802. Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
  803. };
  804. std::vector<Arg> args;
  805. //! dst contig
  806. args.emplace_back(
  807. TensorLayout({5, 32, 9}, {288, 1, 32}, dtype::Int8()),
  808. TensorLayout({5, 9, 32}, {288, 32, 1}, dtype::Int8()));
  809. args.emplace_back(
  810. TensorLayout({5, 9, 32}, {288, 1, 9}, dtype::Int8()),
  811. TensorLayout({5, 32, 9}, {288, 9, 1}, dtype::Int8()));
  812. args.emplace_back(
  813. TensorLayout({5, 4, 9}, {36, 1, 4}, dtype::Int8()),
  814. TensorLayout({5, 9, 4}, {36, 4, 1}, dtype::Int8()));
  815. args.emplace_back(
  816. TensorLayout({5, 9, 4}, {36, 1, 9}, dtype::Int8()),
  817. TensorLayout({5, 4, 9}, {36, 9, 1}, dtype::Int8()));
  818. args.emplace_back(
  819. TensorLayout({5, 32, 4}, {128, 1, 32}, dtype::Int8()),
  820. TensorLayout({5, 4, 32}, {128, 32, 1}, dtype::Int8()));
  821. args.emplace_back(
  822. TensorLayout({5, 4, 32}, {128, 1, 4}, dtype::Int8()),
  823. TensorLayout({5, 32, 4}, {128, 4, 1}, dtype::Int8()));
  824. args.emplace_back(
  825. TensorLayout({5, 7, 5}, {35, 1, 7}, dtype::Int8()),
  826. TensorLayout({5, 5, 7}, {35, 7, 1}, dtype::Int8()));
  827. args.emplace_back(
  828. TensorLayout({5, 5, 7}, {35, 1, 5}, dtype::Int8()),
  829. TensorLayout({5, 7, 5}, {35, 5, 1}, dtype::Int8()));
  830. //! src contig
  831. args.emplace_back(
  832. TensorLayout({5, 9, 32}, {288, 32, 1}, dtype::Int8()),
  833. TensorLayout({5, 32, 9}, {288, 1, 32}, dtype::Int8()));
  834. args.emplace_back(
  835. TensorLayout({5, 32, 9}, {288, 9, 1}, dtype::Int8()),
  836. TensorLayout({5, 9, 32}, {288, 1, 9}, dtype::Int8()));
  837. args.emplace_back(
  838. TensorLayout({5, 9, 4}, {36, 4, 1}, dtype::Int8()),
  839. TensorLayout({5, 4, 9}, {36, 1, 4}, dtype::Int8()));
  840. args.emplace_back(
  841. TensorLayout({5, 4, 9}, {36, 9, 1}, dtype::Int8()),
  842. TensorLayout({5, 9, 4}, {36, 1, 9}, dtype::Int8()));
  843. args.emplace_back(
  844. TensorLayout({5, 4, 32}, {128, 32, 1}, dtype::Int8()),
  845. TensorLayout({5, 32, 4}, {128, 1, 32}, dtype::Int8()));
  846. args.emplace_back(
  847. TensorLayout({5, 32, 4}, {128, 4, 1}, dtype::Int8()),
  848. TensorLayout({5, 4, 32}, {128, 1, 4}, dtype::Int8()));
  849. args.emplace_back(
  850. TensorLayout({5, 5, 7}, {35, 7, 1}, dtype::Int8()),
  851. TensorLayout({5, 7, 5}, {35, 1, 7}, dtype::Int8()));
  852. args.emplace_back(
  853. TensorLayout({5, 7, 5}, {35, 5, 1}, dtype::Int8()),
  854. TensorLayout({5, 5, 7}, {35, 1, 5}, dtype::Int8()));
  855. //! cross
  856. args.emplace_back(
  857. TensorLayout({5, 9, 32}, {288 * 4, 32 * 3, 1}, dtype::Int8()),
  858. TensorLayout({5, 32, 9}, {288 * 4, 1, 32 * 3}, dtype::Int8()));
  859. args.emplace_back(
  860. TensorLayout({5, 32, 9}, {288 * 3, 9 * 2, 1}, dtype::Int8()),
  861. TensorLayout({5, 9, 32}, {288 * 3, 1, 9 * 2}, dtype::Int8()));
  862. args.emplace_back(
  863. TensorLayout({5, 9, 4}, {36 * 10, 4 * 7, 1}, dtype::Int8()),
  864. TensorLayout({5, 4, 9}, {36 * 10, 1, 4 * 7}, dtype::Int8()));
  865. Checker<Relayout> checker(handle_cuda());
  866. for (auto&& arg : args) {
  867. checker.exec(TensorLayoutArray{arg.src, arg.dst});
  868. }
  869. }
  870. TEST_F(CUDA, RELAYOUT_Q4) {
  871. Checker<Relayout> checker(handle_cuda());
  872. UniformIntRNG rng_int4{-7, 7};
  873. checker.set_rng(0, &rng_int4)
  874. .set_rng(1, &rng_int4)
  875. .set_dtype(0, dtype::QuantizedS4(1.f))
  876. .set_dtype(1, dtype::QuantizedS4(1.f))
  877. .execs({{2, 2, 1, 1}, {1, 1, 2, 2}})
  878. .execs({{1, 64, 15, 15}, {1, 15, 15, 64}})
  879. .execs({{1, 5, 9, 32}, {1, 5, 32, 9}})
  880. .execl(TensorLayoutArray{
  881. {{6400}, {1}, dtype::QuantizedS4{1.f}},
  882. {{20, 320}, {1024, 1}, dtype::QuantizedS4{1.f}}})
  883. .execl(TensorLayoutArray{
  884. {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}},
  885. {{20, 60, 3}, {256, 4, 1}, dtype::QuantizedS4{1.f}}})
  886. .execl(TensorLayoutArray{
  887. {{20, 20, 3, 3}, {256, 12, 4, 1}, dtype::QuantizedS4{1.f}},
  888. {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}}})
  889. .execl(TensorLayoutArray{
  890. {{5, 16, 7, 7, 4}, {3136, 196, 28, 4, 1}, dtype::QuantizedS4{1.f}},
  891. {{5, 16, 7, 7, 4}, {3136, 4, 448, 64, 1}, dtype::QuantizedS4{1.f}}})
  892. .execl(TensorLayoutArray{
  893. {{5, 7, 7, 16, 4}, {3136, 448, 64, 4, 1}, dtype::QuantizedS4{1.f}},
  894. {{5, 7, 7, 16, 4}, {3136, 28, 4, 196, 1}, dtype::QuantizedS4{1.f}}})
  895. .execl(TensorLayoutArray{
  896. {{5, 2, 7, 7, 32},
  897. {3136, 1568, 224, 32, 1},
  898. dtype::QuantizedS4{1.f}},
  899. {{5, 2, 7, 7, 32},
  900. {3136, 32, 448, 64, 1},
  901. dtype::QuantizedS4{1.f}}})
  902. .execl(TensorLayoutArray{
  903. {{5, 7, 7, 2, 32}, {3136, 448, 64, 32, 1}, dtype::QuantizedS4{1.f}},
  904. {{5, 7, 7, 2, 32},
  905. {3136, 224, 32, 1568, 1},
  906. dtype::QuantizedS4{1.f}}});
  907. }
  908. // vim: syntax=cpp.doxygen