You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

relayout.cpp 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990
  1. /**
  2. * \file dnn/test/cuda/relayout.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "test/common/relayout.h"
  12. #include "megdnn/oprs.h"
  13. #include "test/common/checker.h"
  14. #include "test/common/rng.h"
  15. #include "test/cuda/benchmark.h"
  16. #include "test/cuda/fixture.h"
  17. using namespace megdnn;
  18. using namespace test;
namespace {
// Instantiate the shared relayout test suite (declared in
// test/common/relayout.h) once per tag in relayout::test_types, running each
// instantiation against this fixture's CUDA handle.
template <typename tag>
class CUDA_RELAYOUT : public CUDA {};
TYPED_TEST_CASE(CUDA_RELAYOUT, relayout::test_types);
TYPED_TEST(CUDA_RELAYOUT, run) {
    relayout::run_test<TypeParam>(this->handle_cuda());
}
}  // namespace
  27. TEST_F(CUDA, RELAYOUT_TRANSPOSE) {
  28. Checker<Relayout> checker(handle_cuda());
  29. auto run = [&](size_t batch, size_t m, size_t n, size_t c, DType dtype) {
  30. checker.set_dtype(0, dtype).set_dtype(1, dtype);
  31. TensorLayout src = {{batch, m, n, c}, dtype};
  32. src.init_contiguous_stride();
  33. TensorLayout dst = {{batch, m, n, c}, dtype};
  34. dst.stride[0] = m * n * c;
  35. dst.stride[1] = c;
  36. dst.stride[2] = m * c;
  37. dst.stride[3] = 1;
  38. checker.execl({src, dst});
  39. };
  40. run(16, 30, 40, 4, dtype::Int8());
  41. run(16, 20, 10, 4, dtype::Int8());
  42. run(1, 30, 20, 1, dtype::Int32());
  43. run(1, 20, 30, 1, dtype::Int32());
  44. run(1, 11, 21, 1, dtype::Float32());
  45. }
  46. #if MEGDNN_WITH_BENCHMARK
  47. TEST_F(CUDA, BENCHMARK_RELAYOUT_TRANSPOSE) {
  48. static constexpr size_t RUNS = 1000;
  49. CUBenchmarker<Relayout> benchmarker(handle_cuda());
  50. benchmarker.set_times(RUNS);
  51. auto run = [&](size_t batch, size_t m, size_t n, size_t c, DType dtype) {
  52. benchmarker.set_dtype(0, dtype).set_dtype(1, dtype);
  53. TensorLayout src = {{batch, m, n, c}, dtype};
  54. src.init_contiguous_stride();
  55. TensorLayout dst = {{batch, m, n, c}, dtype};
  56. dst.stride[0] = m * n * c;
  57. dst.stride[1] = c;
  58. dst.stride[2] = m * c;
  59. dst.stride[3] = 1;
  60. auto time_ms = benchmarker.execl({src, dst}) / RUNS;
  61. printf("{%zux%zux%zux%zu}->{%zux%zux%zux%zu} bandwidth: %.2f gbps\n", batch, m,
  62. n, c, batch, n, m, c,
  63. 2.f * batch * m * n * c * dtype.size() / (1e6 * time_ms));
  64. };
  65. run(16, 640, 480, 4, dtype::Int8());
  66. run(256, 224, 224, 4, dtype::Int8());
  67. run(1, 256, 224 * 224, 1, dtype::Int32());
  68. run(1, 256, 7 * 7 * 512, 1, dtype::Int32());
  69. run(1, 4096, 4096, 1, dtype::Float32());
  70. }
TEST_F(CUDA, BENCHMARK_RELAYOUT) {
    //! benchmark contious layout, such as (a, b, c, d) -> (b, a, c,d)
    //! just change the first two axis
    static constexpr size_t RUNS = 3;
    // Rank-3 cases: src is the layout with its first two axes permuted
    // (a non-contiguous view), dst is rebuilt fully contiguous.
    auto run = [&](const TensorLayoutArray& layouts) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({1, 0, 2});
            TensorLayout dst = layout;
            // NOTE(review): once init_contiguous_stride() runs, a contiguous
            // dst of the same element count is plain linear memory, so this
            // swap only relabels dst's shape; it does not change the copy.
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            // Factor 2: one read plus one write per element.
            printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                           1000 / (1024 * 1024 * 1024));
        }
    };
    // Fixed outer extents (12, 23); the innermost extent sweeps from tiny to
    // huge to probe different coalescing regimes.
    TensorLayoutArray layouts = {
            {{12, 23, 2}, dtype::Int32()},    {{12, 23, 8}, dtype::Int32()},
            {{12, 23, 17}, dtype::Int32()},   {{12, 23, 64}, dtype::Int32()},
            {{12, 23, 129}, dtype::Int32()},  {{12, 23, 256}, dtype::Int32()},
            {{12, 23, 1029}, dtype::Int32()}, {{12, 23, 4096}, dtype::Int32()},
            {{12, 23, 9143}, dtype::Int32()}, {{12, 23, 18284}, dtype::Int32()},
            {{2, 2, 1000000}, dtype::Int32()},
    };
    run(layouts);
    // Rank-4 cases: middle two axes swapped in the source view.
    auto run2 = [&](const TensorLayoutArray& layouts) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({0, 2, 1, 3});
            TensorLayout dst = layout;
            // NOTE(review): swaps shape[0]/shape[1] even though the dimshuffle
            // permutes axes 1 and 2 -- harmless for the reason above, but it
            // reads like a copy-paste leftover from `run`; confirm intent.
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                           1000 / (1024 * 1024 * 1024));
        }
    };
    layouts = {
            {{3, 12, 24, 100}, dtype::Int32()},
            {{3, 12, 24, 1029}, dtype::Int32()},
            {{3, 4, 24, 9143}, dtype::Int32()},
            {{3, 4, 24, 18284}, dtype::Int32()},
    };
    run2(layouts);
}
  120. TEST_F(CUDA, BENCHMARK_RELAYOUT_LAST_CONTIG) {
  121. //! src and dst are all get subtensor in channel axis
  122. static constexpr size_t RUNS = 3;
  123. Benchmarker<Relayout> benchmarker(handle_cuda());
  124. benchmarker.set_times(RUNS);
  125. TensorLayout src =
  126. TensorLayout({5, 5, 100000}, {800000, 100000, 1}, dtype::Float32());
  127. TensorLayout dst =
  128. TensorLayout({5, 5, 100000}, {700000, 100000, 1}, dtype::Float32());
  129. auto used = benchmarker.execl({src, dst});
  130. printf("src: %s dst: %s bandwith: %f gbps/s\n", src.to_string().c_str(),
  131. dst.to_string().c_str(),
  132. 2 * src.total_nr_elems() * src.dtype.size() * RUNS / used * 1000 /
  133. (1024 * 1024 * 1024));
  134. }
  135. TEST_F(CUDA, BENCHMARK_RELAYOUT_LAST_NOT_CONTIG) {
  136. static constexpr size_t RUNS = 3;
  137. auto run = [&](TensorLayout src, TensorLayout dst) {
  138. Benchmarker<Relayout> benchmarker(handle_cuda());
  139. auto&& layout = src;
  140. benchmarker.set_times(RUNS);
  141. dst.init_contiguous_stride();
  142. auto used = benchmarker.execl({src, dst});
  143. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  144. 2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
  145. (1024 * 1024 * 1024));
  146. };
  147. run({{16, 128, 128}, {49152, 384, 3}, dtype::Float32()},
  148. {{16, 128, 128}, {16384, 128, 1}, dtype::Float32()});
  149. }
TEST_F(CUDA, BENCHMARK_RELAYOUT_6) {
    static constexpr size_t RUNS = 3;
    // Benchmarks (and correctness-checks) rank-2..6 transpositions: each
    // layout is paired with the permutation at the same index in
    // `permutations` below.
    auto run = [&](TensorLayoutArray layouts,
                   std::vector<std::vector<size_t>> permutations) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        int i = 0;
        for (auto&& layout : layouts) {
            auto per = permutations[i];
            TensorLayout src = layout.dimshuffle(per);
            TensorLayout dst = layout;
            // NOTE(review): immaterial after init_contiguous_stride() -- a
            // contiguous dst of equal element count is linear memory however
            // its shape is labelled.
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            // Also verify the same case for correctness on the CUDA handle.
            Checker<Relayout> checker(handle_cuda());
            checker.exec(TensorLayoutArray{src, dst});
            // Factor 2: one read plus one write per element.
            printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                           1000 / (1024 * 1024 * 1024));
            i++;
        }
    };
    // 57 layouts: 3 rank-2, 9 rank-3, 15 rank-4, 15 rank-5, 15 rank-6 --
    // matching the 57 permutations below (19 patterns x 3 layouts each).
    TensorLayoutArray layouts = {
            {{7248, 7248}, dtype::Int32()},
            {{43408, 1216}, dtype::Int32()},
            {{1216, 43408}, dtype::Int32()},
            {{368, 384, 384}, dtype::Int32()},
            {{2144, 64, 384}, dtype::Int32()},
            {{368, 64, 2307}, dtype::Int32()},
            {{384, 384, 355}, dtype::Int32()},
            {{2320, 384, 59}, dtype::Int32()},
            {{384, 2320, 59}, dtype::Int32()},
            {{384, 355, 384}, dtype::Int32()},
            {{2320, 59, 384}, dtype::Int32()},
            {{384, 59, 2320}, dtype::Int32()},
            {{80, 96, 75, 96}, dtype::Int32()},
            {{464, 16, 75, 96}, dtype::Int32()},
            {{80, 16, 75, 582}, dtype::Int32()},
            {{96, 75, 96, 75}, dtype::Int32()},
            {{608, 12, 96, 75}, dtype::Int32()},
            {{96, 12, 608, 75}, dtype::Int32()},
            {{96, 75, 96, 75}, dtype::Int32()},
            {{608, 12, 96, 75}, dtype::Int32()},
            {{96, 12, 608, 75}, dtype::Int32()},
            {{96, 96, 75, 75}, dtype::Int32()},
            {{608, 96, 12, 75}, dtype::Int32()},
            {{96, 608, 12, 75}, dtype::Int32()},
            {{96, 75, 75, 96}, dtype::Int32()},
            {{608, 12, 75, 96}, dtype::Int32()},
            {{96, 12, 75, 608}, dtype::Int32()},
            {{32, 48, 28, 28, 48}, dtype::Int32()},
            {{176, 8, 28, 28, 48}, dtype::Int32()},
            {{32, 8, 28, 28, 298}, dtype::Int32()},
            {{48, 28, 28, 48, 28}, dtype::Int32()},
            {{352, 4, 28, 48, 28}, dtype::Int32()},
            {{48, 4, 28, 352, 28}, dtype::Int32()},
            {{48, 28, 48, 28, 28}, dtype::Int32()},
            {{352, 4, 48, 28, 28}, dtype::Int32()},
            {{48, 4, 352, 28, 28}, dtype::Int32()},
            {{48, 48, 28, 28, 28}, dtype::Int32()},
            {{352, 48, 4, 28, 28}, dtype::Int32()},
            {{48, 352, 4, 28, 28}, dtype::Int32()},
            {{48, 28, 28, 28, 48}, dtype::Int32()},
            {{352, 4, 28, 28, 48}, dtype::Int32()},
            {{48, 4, 28, 28, 352}, dtype::Int32()},
            {{16, 32, 15, 32, 15, 15}, dtype::Int32()},
            {{48, 10, 15, 32, 15, 15}, dtype::Int32()},
            {{16, 10, 15, 103, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 32, 15, 15}, dtype::Int32()},
            {{112, 5, 15, 32, 15, 15}, dtype::Int32()},
            {{32, 5, 15, 112, 15, 15}, dtype::Int32()},
            {{32, 15, 32, 15, 15, 15}, dtype::Int32()},
            {{112, 5, 32, 15, 15, 15}, dtype::Int32()},
            {{32, 5, 112, 15, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 32, 15, 15}, dtype::Int32()},
            {{112, 5, 15, 32, 15, 15}, dtype::Int32()},
            {{32, 5, 15, 112, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 15, 15, 32}, dtype::Int32()},
            {{112, 5, 15, 15, 15, 32}, dtype::Int32()},
            {{32, 5, 15, 15, 15, 112}, dtype::Int32()},
    };
    // One permutation per layout above; each pattern repeats three times to
    // cover the three layout variants of its rank.
    std::vector<std::vector<size_t>> permutations = {
            std::vector<size_t>{1, 0},
            std::vector<size_t>{1, 0},
            std::vector<size_t>{1, 0},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{5, 4, 3, 2, 1, 0},
            std::vector<size_t>{5, 4, 3, 2, 1, 0},
            std::vector<size_t>{5, 4, 3, 2, 1, 0}};
    run(layouts, permutations);
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_7) {
    static constexpr size_t RUNS = 3;
    // True iff the permutation is the identity (nothing would be relaid out).
    auto isTrivial = [&](std::vector<size_t>& permutation) {
        for (size_t i = 0; i < permutation.size(); i++) {
            if (permutation[i] != i)
                return false;
        }
        return true;
    };
    // Benchmark + correctness-check one rank-7 transposition: src is the
    // permuted view, dst the same layout rebuilt contiguous.
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        // NOTE(review): immaterial after init_contiguous_stride() -- dst is
        // linear memory regardless of how its shape is labelled.
        std::swap(dst.shape[0], dst.shape[1]);
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        // Factor 2: one read plus one write per element.
        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };
    std::vector<size_t> _dim = {5, 3, 2, 4, 35, 33, 37};
    std::vector<size_t> permutation(7);
    // Inverse
    for (size_t r = 0; r < _dim.size(); r++) {
        size_t size = _dim.size();
        permutation[r] = size - 1 - r;
    }
    run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5], _dim[6]},
         dtype::Int32()},
        permutation);
    // Random
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 50; nsample++) {
        // COMPAT_RANDOM presumably shuffles the range in place, so both the
        // extents and the axis permutation are re-randomized every sample.
        COMPAT_RANDOM(_dim.begin(), _dim.end());
        COMPAT_RANDOM(permutation.begin(), permutation.end());
        if (!isTrivial(permutation)) {
            run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5], _dim[6]},
                 dtype::Int32()},
                permutation);
        }
    }
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_5) {
    static constexpr size_t RUNS = 10;
    // True iff the permutation is the identity (nothing would be relaid out).
    auto isTrivial = [&](std::vector<size_t>& permutation) {
        for (size_t i = 0; i < permutation.size(); i++) {
            if (permutation[i] != i)
                return false;
        }
        return true;
    };
    // Benchmark + correctness-check one transposition: src is the permuted
    // view of `layout`, dst the same layout rebuilt contiguous.
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        // std::swap(dst.shape[0], dst.shape[1]);
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        // Factor 2: one read plus one write per element.
        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };
    size_t two = 2;
    // Target: rank-5 tensors of ~200M elements whose largest extent is about
    // `ratio` times the smallest, with random extents and permutations.
    int ratio = 5;
    int numElemAvg = 1000000 * 200;
    UniformFloatRNG numElem_dist((double)numElemAvg, (double)numElemAvg * 0.2);
    for (int rank = 5; rank <= 5; rank++) {
        for (int iter = 0; iter < 20; iter++) {
            int numElem = (int)numElem_dist.gen_single_val();
            std::vector<size_t> dim(rank);
            std::vector<size_t> permutation(rank);
            std::vector<double> dimf(rank);
            double volf = 1.0;
            // Ideal (fractional) extents grow linearly from 1 to `ratio`.
            for (int r = 0; r < rank; r++) {
                permutation[r] = (size_t)r;
                dimf[r] = 1.0 + (double)r * (ratio - 1.0) / (double)(rank - 1);
                volf *= dimf[r];
            }
            // fprintf(stderr, "volf %lf\n", volf);
            // Scale the ideal extents so their product approximates numElem.
            double scale = pow((double)numElem / volf, 1.0 / (double)rank);
            // fprintf(stderr, "scale %lf\n", scale);
            int vol = 1;
            for (int r = 0; r < rank; r++) {
                if (r == rank - 1) {
                    // Last extent pinned to ratio * dim[0] (dim[0] was set on
                    // the first iteration of this loop).
                    dim[r] = ratio * dim[0];
                } else {
                    dim[r] = (size_t)round(dimf[r] * scale);
                }
                dim[r] = std::max(two, dim[r]);
                vol *= dim[r];
            }
            // fprintf(stderr, "dim[0] %lf\n", dim[0]);
            double cur_ratio = (double)dim[rank - 1] / (double)dim[0];
            double vol_re = fabs((double)(vol - numElem) / (double)numElem);
            // Fix dimensions if volume is off by more than 5%
            if (vol_re > 0.05) {
                // NOTE: d is size_t, so -1 wraps to SIZE_MAX and `dim[r] + d`
                // decrements via modular arithmetic -- deliberate, if obscure.
                size_t d = (vol < numElem) ? 1 : -1;
                int r = 1;
                while (vol_re > 0.05 && r < rank) {
                    size_t dim_plus_d = std::max(two, dim[r] + d);
                    vol = (vol / dim[r]) * dim_plus_d;
                    dim[r] = dim_plus_d;
                    vol_re = fabs((double)(vol - numElem) / (double)numElem);
                    r++;
                }
            }
            size_t minDim = *(std::min_element(dim.begin(), dim.end()));
            size_t maxDim = *(std::max_element(dim.begin(), dim.end()));
            cur_ratio = (double)maxDim / (double)minDim;
            printf("vol %d cur_ratio %lf | %lf\n", vol, cur_ratio, vol_re);
            // printVec(dim);
            // Shuffle the extents, then draw permutations until a
            // non-identity one comes up.
            COMPAT_RANDOM(dim.begin(), dim.end());
            while (isTrivial(permutation)) {
                COMPAT_RANDOM(permutation.begin(), permutation.end());
            }
            run({{dim[0], dim[1], dim[2], dim[3], dim[4]}, dtype::Int32()},
                permutation);
            // if (!bench_tensor<T>(dim, permutation)) return false;
        }
    }
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_NCHW_NCHW4) {
    static constexpr size_t RUNS = 10;
    // Benchmark + correctness-check: src = permuted view of `layout`,
    // dst = the same layout rebuilt contiguous.
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        // Factor 2: one read plus one write per element.
        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };
    UniformIntRNG u(2, 100);
    printf("NCHW->NCHW4\n");
    for (int i = 0; i < 20; i++) {
        int d1 = u.gen_single_val();
        // Channel count rounded up to a multiple of 4 so it splits evenly.
        int d2 = (u.gen_single_val() / 4 + 1) * 4;
        int d3 = 4;
        // int d4=(u.gen_single_val()/4+1)*4;
        int d4 = (u.gen_single_val());
        int d5 = (u.gen_single_val());
        // int d5=(u.gen_single_val()/4+1)*4;
        // int d5 = (u.gen_single_val())*2+1;
        // 5-d {N, C/4, 4, H, W} view of a contiguous buffer; {0, 1, 3, 4, 2}
        // moves the packed 4-axis innermost (NCHW -> NCHW4 direction).
        run({{(size_t)d1, (size_t)d2 / 4, (size_t)d3, (size_t)d4, (size_t)d5},
             {d2 * d3 * d4 * d5 / 4, d3 * d4 * d5, d4 * d5, d5, 1},
             dtype::Int8()},
            {0, 1, 3, 4, 2});
    }
    printf("\n\nNCHW4->NCHW\n");
    for (int i = 0; i < 20; i++) {
        int d1 = u.gen_single_val();
        int d2 = (u.gen_single_val() / 4 + 1) * 4;
        int d3 = u.gen_single_val();
        // int d5=(u.gen_single_val()/4+1)*4;
        int d4 = u.gen_single_val();
        int d5 = 4;
        // Inverse direction: pull the innermost packed 4-axis back out.
        run({{(size_t)d1, (size_t)d2 / 4, (size_t)d3, (size_t)d4, (size_t)d5},
             {d2 * d3 * d4 * d5 / 4, d3 * d4 * d5, d4 * d5, d5, 1},
             dtype::Int8()},
            {0, 1, 4, 2, 3});
    }
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_NCHW4_NCHW32) {
    static constexpr size_t RUNS = 10;
    // Benchmark + correctness-check: src = permuted view of `layout`,
    // dst = the same layout rebuilt contiguous.
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        // Factor 2: one read plus one write per element.
        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };
    UniformIntRNG u(4, 50);
    printf("NCHW4 to NCHW32\n");
    for (int i = 0; i < 20; i++) {
        int d1 = u.gen_single_val();
        // Rounded up to a multiple of 8 so the 8-way split below is even.
        int d2 = (u.gen_single_val() / 8 + 1) * 8;
        int d3 = 8;
        int d4 = u.gen_single_val();
        int d5 = u.gen_single_val();
        int d6 = 4;
        // 6-d view splitting channels into (C/8, 8, ..., 4); the permutation
        // regroups the 8-axis next to the packed 4 (NCHW4 -> NCHW32).
        run({{(size_t)d1, (size_t)d2 / 8, (size_t)d3, (size_t)d4, (size_t)d5,
              (size_t)d6},
             {d2 * d3 * d4 * d5 * d6 / 8, d3 * d4 * d5 * d6, d4 * d5 * d6, d5 * d6, d6,
              1},
             dtype::Int8()},
            {0, 1, 3, 4, 2, 5});
    }
    printf("\n\nNCHW32 to NCHW4\n");
    for (int i = 0; i < 20; i++) {
        int d1 = u.gen_single_val();
        int d2 = (u.gen_single_val() / 8 + 1) * 8;
        int d3 = u.gen_single_val();
        int d4 = u.gen_single_val();
        int d5 = 8;
        int d6 = 4;
        // Inverse direction of the loop above.
        run({{(size_t)d1, (size_t)d2 / 8, (size_t)d3, (size_t)d4, (size_t)d5,
              (size_t)d6},
             {d2 * d3 * d4 * d5 * d6 / 8, d3 * d4 * d5 * d6, d4 * d5 * d6, d5 * d6, d6,
              1},
             dtype::Int8()},
            {0, 1, 4, 2, 3, 5});
    }
}
TEST_F(CUDA, BENCHMARK_LAST_CONTIG_ALIGN_TEST) {
    static constexpr size_t RUNS = 10;
    // Benchmark + correctness-check: src = permuted view of `layout`,
    // dst = the same layout rebuilt contiguous.
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        // std::swap(dst.shape[0], dst.shape[1]);
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        // Factor 2: one read plus one write per element.
        printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };
    UniformIntRNG u(4, 50);
    std::vector<size_t> _dim(6);
    std::vector<size_t> permutation(_dim.size());
    // First case: full axis reversal.
    for (size_t r = 0; r < _dim.size(); r++) {
        size_t size = _dim.size();
        permutation[r] = size - 1 - r;
    }
    _dim[0] = u.gen_single_val();
    _dim[1] = u.gen_single_val();
    _dim[2] = u.gen_single_val();
    _dim[3] = u.gen_single_val();
    _dim[4] = u.gen_single_val();
    // Last extent forced to a multiple of 4 (int8: 4-byte-aligned rows).
    _dim[5] = (u.gen_single_val() / 4 + 1) * 4;
    run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5]}, dtype::Int8()},
        permutation);
    // Random
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 20; nsample++) {
        // end() - 1: shuffle only the first five entries so the last axis
        // keeps its position (and the last dim stays innermost-contiguous).
        COMPAT_RANDOM(_dim.begin(), _dim.end() - 1);
        COMPAT_RANDOM(permutation.begin(), permutation.end() - 1);
        // First 5 samples use a 4-aligned last extent, the rest an arbitrary
        // one -- contrasting aligned vs unaligned innermost access.
        if (nsample < 5)
            _dim[5] = (u.gen_single_val() / 4 + 1) * 4;
        else
            _dim[5] = u.gen_single_val();
        run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5]}, dtype::Int8()},
            permutation);
    }
}
  556. #endif
TEST_F(CUDA, RELAYOUT) {
    // Exhaustive correctness cases: pairs of equal-shape layouts with
    // assorted (contiguous, transposed, strided, broadcast-free) stride
    // patterns, each executed through the Checker at the end.
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
    {
        // contiguous stride
        args.emplace_back(
                TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()),
                TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()),
                TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Float16()),
                TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Float16()));
    }
    // Non-unit innermost strides (elements interleaved with gaps of 1 or 2),
    // both Float16 and Int32 variants.
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int32()));
    // Strided (innermost stride 3) source into a dense destination.
    args.emplace_back(
            TensorLayout({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
            TensorLayout({16, 128, 128}, {16384, 128, 1}, dtype::Float32()));
    {
        // 1d
        size_t n = 10000;
        args.emplace_back(
                TensorLayout({n}, {1}, dtype::Int32()),
                TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {1}, dtype::Int32()),
                TensorLayout({n}, {2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {2}, dtype::Int32()),
                TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {2}, dtype::Int32()),
                TensorLayout({n}, {2}, dtype::Int32()));
    }
    {
        // 2d
        size_t m = 200, n = 300, k = 400;
        ptrdiff_t k2 = k * 2;
        // All combinations of row-major/column-major-ish strides with an odd
        // leading stride (k2 + 1) to defeat any alignment assumptions.
        args.emplace_back(
                TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
    }
    {
        // 3d
        size_t m = 20, n = 30, k = 40;
        ptrdiff_t k2 = k;
        args.emplace_back(
                TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()),
                TensorLayout(
                        {m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()));
    }
    {
        // simplify_layout
        // 234..56
        // 2..3456
        args.emplace_back(
                TensorLayout(
                        {2, 3, 4, 5, 6},
                        {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
                        dtype::Int32()),
                TensorLayout(
                        {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
                        dtype::Int32()));
    }
    Checker<Relayout> checker(handle_cuda());
    for (auto&& arg : args) {
        checker.exec(TensorLayoutArray{arg.src, arg.dst});
    }
}
  668. TEST_F(CUDA, TRANSPOSE_INT8) {
  669. auto run = [&](TensorLayout layout, std::vector<size_t> per) {
  670. TensorLayout src = layout.dimshuffle(per);
  671. TensorLayout dst = layout;
  672. dst.init_contiguous_stride();
  673. Checker<Relayout> checker(handle_cuda());
  674. checker.exec(TensorLayoutArray{src, dst});
  675. };
  676. //! for last contig(NCHW4<->NCHW32)
  677. run({{5, 8, 4, 3, 8}, dtype::Int8()}, {1, 3, 0, 2, 4});
  678. run({{5, 8, 4, 3, 5}, dtype::Int8()}, {1, 3, 0, 2, 4});
  679. run({{5, 8, 4, 3, 64}, dtype::Int8()}, {1, 3, 0, 2, 4});
  680. //! for last no contig(NCHW->NCHW4)
  681. run({{7, 4, 32}, dtype::Int8()}, {2, 0, 1});
  682. run({{7, 4, 64}, dtype::Int8()}, {2, 0, 1});
  683. run({{7, 4, 7}, dtype::Int8()}, {2, 0, 1});
  684. //! for copy
  685. run({{2, 3, 4, 5, 6},
  686. {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  687. dtype::Int8()},
  688. {0, 1, 2, 3, 4});
  689. }
  690. TEST_F(CUDA, RELAYOUT_INT8) {
  691. struct Arg {
  692. TensorLayout src, dst;
  693. Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
  694. };
  695. std::vector<Arg> args;
  696. {
  697. // contiguous stride
  698. args.emplace_back(
  699. TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Int8()),
  700. TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Int8()));
  701. args.emplace_back(
  702. TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Int8()),
  703. TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Int8()));
  704. args.emplace_back(
  705. TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Int8()),
  706. TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Int8()));
  707. }
  708. args.emplace_back(
  709. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()),
  710. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()));
  711. args.emplace_back(
  712. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  713. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()));
  714. args.emplace_back(
  715. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  716. TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int8()));
  717. args.emplace_back(
  718. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()),
  719. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()));
  720. args.emplace_back(
  721. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  722. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()));
  723. args.emplace_back(
  724. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
  725. TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int8()));
  726. args.emplace_back(
  727. TensorLayout({16, 128, 128}, {49152, 384, 3}, dtype::Int8()),
  728. TensorLayout({16, 128, 128}, {16384, 128, 1}, dtype::Int8()));
  729. {
  730. // 1d
  731. size_t n = 10000;
  732. args.emplace_back(
  733. TensorLayout({n}, {1}, dtype::Int8()),
  734. TensorLayout({n}, {1}, dtype::Int8()));
  735. args.emplace_back(
  736. TensorLayout({n}, {1}, dtype::Int8()),
  737. TensorLayout({n}, {2}, dtype::Int8()));
  738. args.emplace_back(
  739. TensorLayout({n}, {2}, dtype::Int8()),
  740. TensorLayout({n}, {1}, dtype::Int8()));
  741. args.emplace_back(
  742. TensorLayout({n}, {2}, dtype::Int8()),
  743. TensorLayout({n}, {2}, dtype::Int8()));
  744. }
  745. {
  746. // 2d
  747. size_t m = 200, n = 300, k = 400;
  748. ptrdiff_t k2 = k * 2;
  749. args.emplace_back(
  750. TensorLayout({m, n}, {k2, 2}, dtype::Int8()),
  751. TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int8()));
  752. args.emplace_back(
  753. TensorLayout({m, n}, {2, k2}, dtype::Int8()),
  754. TensorLayout({m, n}, {2, k2 + 1}, dtype::Int8()));
  755. args.emplace_back(
  756. TensorLayout({m, n}, {2, k2}, dtype::Int8()),
  757. TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int8()));
  758. args.emplace_back(
  759. TensorLayout({m, n}, {k2, 2}, dtype::Int8()),
  760. TensorLayout({m, n}, {2, k2 + 1}, dtype::Int8()));
  761. args.emplace_back(
  762. TensorLayout({m, n}, {k2, 1}, dtype::Int8()),
  763. TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int8()));
  764. args.emplace_back(
  765. TensorLayout({m, n}, {1, k2}, dtype::Int8()),
  766. TensorLayout({m, n}, {1, k2 + 1}, dtype::Int8()));
  767. args.emplace_back(
  768. TensorLayout({m, n}, {1, k2}, dtype::Int8()),
  769. TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int8()));
  770. args.emplace_back(
  771. TensorLayout({m, n}, {k2, 1}, dtype::Int8()),
  772. TensorLayout({m, n}, {1, k2 + 1}, dtype::Int8()));
  773. }
  774. {
  775. // 3d
  776. size_t m = 20, n = 30, k = 40;
  777. ptrdiff_t k2 = k;
  778. args.emplace_back(
  779. TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int8()),
  780. TensorLayout(
  781. {m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2}, dtype::Int8()));
  782. }
  783. {
  784. // simplify_layout
  785. // 234..56
  786. // 2..3456
  787. args.emplace_back(
  788. TensorLayout(
  789. {2, 3, 4, 5, 6},
  790. {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  791. dtype::Int8()),
  792. TensorLayout(
  793. {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
  794. dtype::Int8()));
  795. args.emplace_back(
  796. TensorLayout(
  797. {2, 3, 4, 5, 6},
  798. {4 * 3 * 4 * 5 * 6, 4 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  799. dtype::Int8()),
  800. TensorLayout(
  801. {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
  802. dtype::Int8()));
  803. }
  804. Checker<Relayout> checker(handle_cuda());
  805. for (auto&& arg : args) {
  806. checker.exec(TensorLayoutArray{arg.src, arg.dst});
  807. }
  808. }
  809. TEST_F(CUDA, RELAYOUT_TEST) {
  810. struct Arg {
  811. TensorLayout src, dst;
  812. Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
  813. };
  814. std::vector<Arg> args;
  815. //! dst contig
  816. args.emplace_back(
  817. TensorLayout({5, 32, 9}, {288, 1, 32}, dtype::Int8()),
  818. TensorLayout({5, 9, 32}, {288, 32, 1}, dtype::Int8()));
  819. args.emplace_back(
  820. TensorLayout({5, 9, 32}, {288, 1, 9}, dtype::Int8()),
  821. TensorLayout({5, 32, 9}, {288, 9, 1}, dtype::Int8()));
  822. args.emplace_back(
  823. TensorLayout({5, 4, 9}, {36, 1, 4}, dtype::Int8()),
  824. TensorLayout({5, 9, 4}, {36, 4, 1}, dtype::Int8()));
  825. args.emplace_back(
  826. TensorLayout({5, 9, 4}, {36, 1, 9}, dtype::Int8()),
  827. TensorLayout({5, 4, 9}, {36, 9, 1}, dtype::Int8()));
  828. args.emplace_back(
  829. TensorLayout({5, 32, 4}, {128, 1, 32}, dtype::Int8()),
  830. TensorLayout({5, 4, 32}, {128, 32, 1}, dtype::Int8()));
  831. args.emplace_back(
  832. TensorLayout({5, 4, 32}, {128, 1, 4}, dtype::Int8()),
  833. TensorLayout({5, 32, 4}, {128, 4, 1}, dtype::Int8()));
  834. args.emplace_back(
  835. TensorLayout({5, 7, 5}, {35, 1, 7}, dtype::Int8()),
  836. TensorLayout({5, 5, 7}, {35, 7, 1}, dtype::Int8()));
  837. args.emplace_back(
  838. TensorLayout({5, 5, 7}, {35, 1, 5}, dtype::Int8()),
  839. TensorLayout({5, 7, 5}, {35, 5, 1}, dtype::Int8()));
  840. //! src contig
  841. args.emplace_back(
  842. TensorLayout({5, 9, 32}, {288, 32, 1}, dtype::Int8()),
  843. TensorLayout({5, 32, 9}, {288, 1, 32}, dtype::Int8()));
  844. args.emplace_back(
  845. TensorLayout({5, 32, 9}, {288, 9, 1}, dtype::Int8()),
  846. TensorLayout({5, 9, 32}, {288, 1, 9}, dtype::Int8()));
  847. args.emplace_back(
  848. TensorLayout({5, 9, 4}, {36, 4, 1}, dtype::Int8()),
  849. TensorLayout({5, 4, 9}, {36, 1, 4}, dtype::Int8()));
  850. args.emplace_back(
  851. TensorLayout({5, 4, 9}, {36, 9, 1}, dtype::Int8()),
  852. TensorLayout({5, 9, 4}, {36, 1, 9}, dtype::Int8()));
  853. args.emplace_back(
  854. TensorLayout({5, 4, 32}, {128, 32, 1}, dtype::Int8()),
  855. TensorLayout({5, 32, 4}, {128, 1, 32}, dtype::Int8()));
  856. args.emplace_back(
  857. TensorLayout({5, 32, 4}, {128, 4, 1}, dtype::Int8()),
  858. TensorLayout({5, 4, 32}, {128, 1, 4}, dtype::Int8()));
  859. args.emplace_back(
  860. TensorLayout({5, 5, 7}, {35, 7, 1}, dtype::Int8()),
  861. TensorLayout({5, 7, 5}, {35, 1, 7}, dtype::Int8()));
  862. args.emplace_back(
  863. TensorLayout({5, 7, 5}, {35, 5, 1}, dtype::Int8()),
  864. TensorLayout({5, 5, 7}, {35, 1, 5}, dtype::Int8()));
  865. //! cross
  866. args.emplace_back(
  867. TensorLayout({5, 9, 32}, {288 * 4, 32 * 3, 1}, dtype::Int8()),
  868. TensorLayout({5, 32, 9}, {288 * 4, 1, 32 * 3}, dtype::Int8()));
  869. args.emplace_back(
  870. TensorLayout({5, 32, 9}, {288 * 3, 9 * 2, 1}, dtype::Int8()),
  871. TensorLayout({5, 9, 32}, {288 * 3, 1, 9 * 2}, dtype::Int8()));
  872. args.emplace_back(
  873. TensorLayout({5, 9, 4}, {36 * 10, 4 * 7, 1}, dtype::Int8()),
  874. TensorLayout({5, 4, 9}, {36 * 10, 1, 4 * 7}, dtype::Int8()));
  875. Checker<Relayout> checker(handle_cuda());
  876. for (auto&& arg : args) {
  877. checker.exec(TensorLayoutArray{arg.src, arg.dst});
  878. }
  879. }
  880. TEST_F(CUDA, RELAYOUT_Q4) {
  881. Checker<Relayout> checker(handle_cuda());
  882. UniformIntRNG rng_int4{-7, 7};
  883. checker.set_rng(0, &rng_int4)
  884. .set_rng(1, &rng_int4)
  885. .set_dtype(0, dtype::QuantizedS4(1.f))
  886. .set_dtype(1, dtype::QuantizedS4(1.f))
  887. .execs({{2, 2, 1, 1}, {1, 1, 2, 2}})
  888. .execs({{1, 64, 15, 15}, {1, 15, 15, 64}})
  889. .execs({{1, 5, 9, 32}, {1, 5, 32, 9}})
  890. .execl(TensorLayoutArray{
  891. {{6400}, {1}, dtype::QuantizedS4{1.f}},
  892. {{20, 320}, {1024, 1}, dtype::QuantizedS4{1.f}}})
  893. .execl(TensorLayoutArray{
  894. {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}},
  895. {{20, 60, 3}, {256, 4, 1}, dtype::QuantizedS4{1.f}}})
  896. .execl(TensorLayoutArray{
  897. {{20, 20, 3, 3}, {256, 12, 4, 1}, dtype::QuantizedS4{1.f}},
  898. {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}}})
  899. .execl(TensorLayoutArray{
  900. {{5, 16, 7, 7, 4}, {3136, 196, 28, 4, 1}, dtype::QuantizedS4{1.f}},
  901. {{5, 16, 7, 7, 4}, {3136, 4, 448, 64, 1}, dtype::QuantizedS4{1.f}}})
  902. .execl(TensorLayoutArray{
  903. {{5, 7, 7, 16, 4}, {3136, 448, 64, 4, 1}, dtype::QuantizedS4{1.f}},
  904. {{5, 7, 7, 16, 4}, {3136, 28, 4, 196, 1}, dtype::QuantizedS4{1.f}}})
  905. .execl(TensorLayoutArray{
  906. {{5, 2, 7, 7, 32},
  907. {3136, 1568, 224, 32, 1},
  908. dtype::QuantizedS4{1.f}},
  909. {{5, 2, 7, 7, 32},
  910. {3136, 32, 448, 64, 1},
  911. dtype::QuantizedS4{1.f}}})
  912. .execl(TensorLayoutArray{
  913. {{5, 7, 7, 2, 32}, {3136, 448, 64, 32, 1}, dtype::QuantizedS4{1.f}},
  914. {{5, 7, 7, 2, 32},
  915. {3136, 224, 32, 1568, 1},
  916. dtype::QuantizedS4{1.f}}});
  917. }
  918. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台