
relayout.cpp 38 kB

/**
 * \file dnn/test/cuda/relayout.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/common/relayout.h"
#include "megdnn/oprs.h"
#include "test/common/checker.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/fixture.h"
#include "test/common/rng.h"

using namespace megdnn;
using namespace test;

namespace {
template <typename tag>
class CUDA_RELAYOUT : public CUDA {};
TYPED_TEST_CASE(CUDA_RELAYOUT, relayout::test_types);
TYPED_TEST(CUDA_RELAYOUT, run) {
    relayout::run_test<TypeParam>(this->handle_cuda());
}
}  // namespace
TEST_F(CUDA, RELAYOUT_TRANSPOSE) {
    Checker<Relayout> checker(handle_cuda());
    auto run = [&](size_t batch, size_t m, size_t n, size_t c, DType dtype) {
        checker.set_dtype(0, dtype).set_dtype(1, dtype);
        TensorLayout src = {{batch, m, n, c}, dtype};
        src.init_contiguous_stride();
        TensorLayout dst = {{batch, m, n, c}, dtype};
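        // dst keeps src's shape but takes the strides of a contiguous
        // (batch, n, m, c) tensor, i.e. src with its m and n axes transposed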
        dst.stride[0] = m * n * c;
        dst.stride[1] = c;
        dst.stride[2] = m * c;
        dst.stride[3] = 1;
        checker.execl({src, dst});
    };
    run(16, 30, 40, 4, dtype::Int8());
    run(16, 20, 10, 4, dtype::Int8());
    run(1, 30, 20, 1, dtype::Int32());
    run(1, 20, 30, 1, dtype::Int32());
    run(1, 11, 21, 1, dtype::Float32());
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_RELAYOUT_TRANSPOSE) {
    static constexpr size_t RUNS = 1000;
    CUBenchmarker<Relayout> benchmarker(handle_cuda());
    benchmarker.set_times(RUNS);
    auto run = [&](size_t batch, size_t m, size_t n, size_t c, DType dtype) {
        benchmarker.set_dtype(0, dtype).set_dtype(1, dtype);
        TensorLayout src = {{batch, m, n, c}, dtype};
        src.init_contiguous_stride();
        TensorLayout dst = {{batch, m, n, c}, dtype};
        dst.stride[0] = m * n * c;
        dst.stride[1] = c;
        dst.stride[2] = m * c;
        dst.stride[3] = 1;
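        // relayout reads each element once and writes it once, hence the
        // factor of 2 in the effective-bandwidth figure below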
        auto time_ms = benchmarker.execl({src, dst}) / RUNS;
        printf("{%zux%zux%zux%zu}->{%zux%zux%zux%zu} bandwidth: %.2f GB/s\n",
               batch, m, n, c, batch, n, m, c,
               2.f * batch * m * n * c * dtype.size() / (1e6 * time_ms));
    };
    run(16, 640, 480, 4, dtype::Int8());
    run(256, 224, 224, 4, dtype::Int8());
    run(1, 256, 224 * 224, 1, dtype::Int32());
    run(1, 256, 7 * 7 * 512, 1, dtype::Int32());
    run(1, 4096, 4096, 1, dtype::Float32());
}
TEST_F(CUDA, BENCHMARK_RELAYOUT) {
    //! benchmark contiguous layouts such as (a, b, c, d) -> (b, a, c, d):
    //! only the first two axes are swapped
    static constexpr size_t RUNS = 3;
    auto run = [&](const TensorLayoutArray& layouts) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({1, 0, 2});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            printf("layout: %s bandwidth: %f GiB/s\n",
                   layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS /
                           used * 1000 / (1024 * 1024 * 1024));
        }
    };
    TensorLayoutArray layouts = {
            {{12, 23, 2}, dtype::Int32()},
            {{12, 23, 8}, dtype::Int32()},
            {{12, 23, 17}, dtype::Int32()},
            {{12, 23, 64}, dtype::Int32()},
            {{12, 23, 129}, dtype::Int32()},
            {{12, 23, 256}, dtype::Int32()},
            {{12, 23, 1029}, dtype::Int32()},
            {{12, 23, 4096}, dtype::Int32()},
            {{12, 23, 9143}, dtype::Int32()},
            {{12, 23, 18284}, dtype::Int32()},
            {{2, 2, 1000000}, dtype::Int32()},
    };
    run(layouts);
    auto run2 = [&](const TensorLayoutArray& layouts) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({0, 2, 1, 3});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            printf("layout: %s bandwidth: %f GiB/s\n",
                   layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS /
                           used * 1000 / (1024 * 1024 * 1024));
        }
    };
    layouts = {
            {{3, 12, 24, 100}, dtype::Int32()},
            {{3, 12, 24, 1029}, dtype::Int32()},
            {{3, 4, 24, 9143}, dtype::Int32()},
            {{3, 4, 24, 18284}, dtype::Int32()},
    };
    run2(layouts);
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_LAST_CONTIG) {
    //! both src and dst are subtensors taken along the channel axis
    static constexpr size_t RUNS = 3;
    Benchmarker<Relayout> benchmarker(handle_cuda());
    benchmarker.set_times(RUNS);
    TensorLayout src =
            TensorLayout({5, 5, 100000}, {800000, 100000, 1}, dtype::Float32());
    TensorLayout dst =
            TensorLayout({5, 5, 100000}, {700000, 100000, 1}, dtype::Float32());
    auto used = benchmarker.execl({src, dst});
    printf("src: %s dst: %s bandwidth: %f GiB/s\n", src.to_string().c_str(),
           dst.to_string().c_str(),
           2 * src.total_nr_elems() * src.dtype.size() * RUNS / used * 1000 /
                   (1024 * 1024 * 1024));
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_LAST_NOT_CONTIG) {
    static constexpr size_t RUNS = 3;
    auto run = [&](TensorLayout src, TensorLayout dst) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        auto&& layout = src;
        benchmarker.set_times(RUNS);
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        printf("layout: %s bandwidth: %f GiB/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                       1000 / (1024 * 1024 * 1024));
    };
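    // src strides select every 3rd float along the innermost axis, so the
    // last axis is not contiguous; dst is a plain contiguous tensor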
    run({{16, 128, 128}, {49152, 384, 3}, dtype::Float32()},
        {{16, 128, 128}, {16384, 128, 1}, dtype::Float32()});
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_6) {
    static constexpr size_t RUNS = 3;
    auto run = [&](TensorLayoutArray layouts,
                   std::vector<std::vector<size_t>> permutations) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        int i = 0;
        for (auto&& layout : layouts) {
            auto per = permutations[i];
            TensorLayout src = layout.dimshuffle(per);
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            auto used = benchmarker.execl({src, dst});
            Checker<Relayout> checker(handle_cuda());
            checker.exec(TensorLayoutArray{src, dst});
            printf("layout: %s bandwidth: %f GiB/s\n",
                   layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() * RUNS /
                           used * 1000 / (1024 * 1024 * 1024));
            i++;
        }
    };
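    // layouts[i] is shuffled by permutations[i]: each permutation pattern
    // below appears three times, paired with three different shapes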
    TensorLayoutArray layouts = {
            {{7248, 7248}, dtype::Int32()},
            {{43408, 1216}, dtype::Int32()},
            {{1216, 43408}, dtype::Int32()},
            {{368, 384, 384}, dtype::Int32()},
            {{2144, 64, 384}, dtype::Int32()},
            {{368, 64, 2307}, dtype::Int32()},
            {{384, 384, 355}, dtype::Int32()},
            {{2320, 384, 59}, dtype::Int32()},
            {{384, 2320, 59}, dtype::Int32()},
            {{384, 355, 384}, dtype::Int32()},
            {{2320, 59, 384}, dtype::Int32()},
            {{384, 59, 2320}, dtype::Int32()},
            {{80, 96, 75, 96}, dtype::Int32()},
            {{464, 16, 75, 96}, dtype::Int32()},
            {{80, 16, 75, 582}, dtype::Int32()},
            {{96, 75, 96, 75}, dtype::Int32()},
            {{608, 12, 96, 75}, dtype::Int32()},
            {{96, 12, 608, 75}, dtype::Int32()},
            {{96, 75, 96, 75}, dtype::Int32()},
            {{608, 12, 96, 75}, dtype::Int32()},
            {{96, 12, 608, 75}, dtype::Int32()},
            {{96, 96, 75, 75}, dtype::Int32()},
            {{608, 96, 12, 75}, dtype::Int32()},
            {{96, 608, 12, 75}, dtype::Int32()},
            {{96, 75, 75, 96}, dtype::Int32()},
            {{608, 12, 75, 96}, dtype::Int32()},
            {{96, 12, 75, 608}, dtype::Int32()},
            {{32, 48, 28, 28, 48}, dtype::Int32()},
            {{176, 8, 28, 28, 48}, dtype::Int32()},
            {{32, 8, 28, 28, 298}, dtype::Int32()},
            {{48, 28, 28, 48, 28}, dtype::Int32()},
            {{352, 4, 28, 48, 28}, dtype::Int32()},
            {{48, 4, 28, 352, 28}, dtype::Int32()},
            {{48, 28, 48, 28, 28}, dtype::Int32()},
            {{352, 4, 48, 28, 28}, dtype::Int32()},
            {{48, 4, 352, 28, 28}, dtype::Int32()},
            {{48, 48, 28, 28, 28}, dtype::Int32()},
            {{352, 48, 4, 28, 28}, dtype::Int32()},
            {{48, 352, 4, 28, 28}, dtype::Int32()},
            {{48, 28, 28, 28, 48}, dtype::Int32()},
            {{352, 4, 28, 28, 48}, dtype::Int32()},
            {{48, 4, 28, 28, 352}, dtype::Int32()},
            {{16, 32, 15, 32, 15, 15}, dtype::Int32()},
            {{48, 10, 15, 32, 15, 15}, dtype::Int32()},
            {{16, 10, 15, 103, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 32, 15, 15}, dtype::Int32()},
            {{112, 5, 15, 32, 15, 15}, dtype::Int32()},
            {{32, 5, 15, 112, 15, 15}, dtype::Int32()},
            {{32, 15, 32, 15, 15, 15}, dtype::Int32()},
            {{112, 5, 32, 15, 15, 15}, dtype::Int32()},
            {{32, 5, 112, 15, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 32, 15, 15}, dtype::Int32()},
            {{112, 5, 15, 32, 15, 15}, dtype::Int32()},
            {{32, 5, 15, 112, 15, 15}, dtype::Int32()},
            {{32, 15, 15, 15, 15, 32}, dtype::Int32()},
            {{112, 5, 15, 15, 15, 32}, dtype::Int32()},
            {{32, 5, 15, 15, 15, 112}, dtype::Int32()},
    };
    std::vector<std::vector<size_t>> permutations = {
            std::vector<size_t>{1, 0},
            std::vector<size_t>{1, 0},
            std::vector<size_t>{1, 0},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{0, 2, 1},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{1, 0, 2},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{2, 1, 0},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{0, 3, 2, 1},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 1, 3, 0},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{2, 0, 3, 1},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{1, 0, 3, 2},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{3, 2, 1, 0},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{0, 4, 2, 1, 3},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{3, 2, 1, 4, 0},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{2, 0, 4, 1, 3},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{1, 3, 0, 4, 2},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{4, 3, 2, 1, 0},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{0, 3, 2, 5, 4, 1},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{3, 2, 0, 5, 1, 4},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{2, 0, 4, 1, 5, 3},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{3, 2, 5, 1, 0, 4},
            std::vector<size_t>{5, 4, 3, 2, 1, 0},
            std::vector<size_t>{5, 4, 3, 2, 1, 0},
            std::vector<size_t>{5, 4, 3, 2, 1, 0}};
    run(layouts, permutations);
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_7) {
    static constexpr size_t RUNS = 3;
    auto isTrivial = [&](std::vector<size_t>& permutation) {
        for (size_t i = 0; i < permutation.size(); i++) {
            if (permutation[i] != i)
                return false;
        }
        return true;
    };
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        Benchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        std::swap(dst.shape[0], dst.shape[1]);
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        printf("layout: %s bandwidth: %f GiB/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                       1000 / (1024 * 1024 * 1024));
    };
    std::vector<size_t> _dim = {5, 3, 2, 4, 35, 33, 37};
    std::vector<size_t> permutation(7);
    // inverse permutation: reverse all seven axes
    for (size_t r = 0; r < _dim.size(); r++) {
        size_t size = _dim.size();
        permutation[r] = size - 1 - r;
    }
    run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5], _dim[6]},
         dtype::Int32()},
        permutation);
    // random shapes with random non-trivial permutations
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 50; nsample++) {
        COMPAT_RANDOM(_dim.begin(), _dim.end());
        COMPAT_RANDOM(permutation.begin(), permutation.end());
        if (!isTrivial(permutation)) {
            run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5],
                  _dim[6]},
                 dtype::Int32()},
                permutation);
        }
    }
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_5) {
    static constexpr size_t RUNS = 10;
    auto isTrivial = [&](std::vector<size_t>& permutation) {
        for (size_t i = 0; i < permutation.size(); i++) {
            if (permutation[i] != i)
                return false;
        }
        return true;
    };
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        printf("layout: %s bandwidth: %f GiB/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                       1000 / (1024 * 1024 * 1024));
    };
    size_t two = 2;
    int ratio = 5;
    int numElemAvg = 1000000 * 200;
    UniformFloatRNG numElem_dist((double)numElemAvg, (double)numElemAvg * 0.2);
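    // draw a target element count from numElem_dist, then build a rank-5
    // shape whose dims grow roughly linearly so that max(dim)/min(dim) is
    // close to `ratio`; the loop below nudges the dims until the total
    // volume is within 5% of the target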
    for (int rank = 5; rank <= 5; rank++) {
        for (int iter = 0; iter < 20; iter++) {
            int numElem = (int)numElem_dist.gen_single_val();
            std::vector<size_t> dim(rank);
            std::vector<size_t> permutation(rank);
            std::vector<double> dimf(rank);
            double volf = 1.0;
            for (int r = 0; r < rank; r++) {
                permutation[r] = (size_t)r;
                dimf[r] = 1.0 + (double)r * (ratio - 1.0) / (double)(rank - 1);
                volf *= dimf[r];
            }
            double scale = pow((double)numElem / volf, 1.0 / (double)rank);
            int vol = 1;
            for (int r = 0; r < rank; r++) {
                if (r == rank - 1) {
                    dim[r] = ratio * dim[0];
                } else {
                    dim[r] = (size_t)round(dimf[r] * scale);
                }
                dim[r] = std::max(two, dim[r]);
                vol *= dim[r];
            }
            double cur_ratio = (double)dim[rank - 1] / (double)dim[0];
            double vol_re = fabs((double)(vol - numElem) / (double)numElem);
            // fix dimensions if the volume is off by more than 5%
            if (vol_re > 0.05) {
                size_t d = (vol < numElem) ? 1 : -1;
                int r = 1;
                while (vol_re > 0.05 && r < rank) {
                    size_t dim_plus_d = std::max(two, dim[r] + d);
                    vol = (vol / dim[r]) * dim_plus_d;
                    dim[r] = dim_plus_d;
                    vol_re = fabs((double)(vol - numElem) / (double)numElem);
                    r++;
                }
            }
            size_t minDim = *(std::min_element(dim.begin(), dim.end()));
            size_t maxDim = *(std::max_element(dim.begin(), dim.end()));
            cur_ratio = (double)maxDim / (double)minDim;
            printf("vol %d cur_ratio %lf | %lf\n", vol, cur_ratio, vol_re);
            COMPAT_RANDOM(dim.begin(), dim.end());
            while (isTrivial(permutation)) {
                COMPAT_RANDOM(permutation.begin(), permutation.end());
            }
            run({{dim[0], dim[1], dim[2], dim[3], dim[4]}, dtype::Int32()},
                permutation);
        }
    }
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_NCHW_NCHW4) {
    static constexpr size_t RUNS = 10;
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        printf("layout: %s bandwidth: %f GiB/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                       1000 / (1024 * 1024 * 1024));
    };
    UniformIntRNG u(2, 100);
    printf("NCHW->NCHW4\n");
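    // the 5-D layout views an NCHW tensor as (N, C/4, 4, H, W); the
    // dimshuffle moves the 4-wide channel sub-axis innermost, which is the
    // NCHW <-> NCHW4 conversion this benchmark exercises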
    for (int i = 0; i < 20; i++) {
        int d1 = u.gen_single_val();
        int d2 = (u.gen_single_val() / 4 + 1) * 4;
        int d3 = 4;
        int d4 = u.gen_single_val();
        int d5 = u.gen_single_val();
        run({{(size_t)d1, (size_t)d2 / 4, (size_t)d3, (size_t)d4, (size_t)d5},
             {d2 * d3 * d4 * d5 / 4, d3 * d4 * d5, d4 * d5, d5, 1},
             dtype::Int8()},
            {0, 1, 3, 4, 2});
    }
    printf("\n\nNCHW4->NCHW\n");
    for (int i = 0; i < 20; i++) {
        int d1 = u.gen_single_val();
        int d2 = (u.gen_single_val() / 4 + 1) * 4;
        int d3 = u.gen_single_val();
        int d4 = u.gen_single_val();
        int d5 = 4;
        run({{(size_t)d1, (size_t)d2 / 4, (size_t)d3, (size_t)d4, (size_t)d5},
             {d2 * d3 * d4 * d5 / 4, d3 * d4 * d5, d4 * d5, d5, 1},
             dtype::Int8()},
            {0, 1, 4, 2, 3});
    }
}
TEST_F(CUDA, BENCHMARK_RELAYOUT_NCHW4_NCHW32) {
    static constexpr size_t RUNS = 10;
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        printf("layout: %s bandwidth: %f GiB/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                       1000 / (1024 * 1024 * 1024));
    };
    UniformIntRNG u(4, 50);
    printf("NCHW4 to NCHW32\n");
    for (int i = 0; i < 20; i++) {
        int d1 = u.gen_single_val();
        int d2 = (u.gen_single_val() / 8 + 1) * 8;
        int d3 = 8;
        int d4 = u.gen_single_val();
        int d5 = u.gen_single_val();
        int d6 = 4;
        run({{(size_t)d1, (size_t)d2 / 8, (size_t)d3, (size_t)d4, (size_t)d5,
              (size_t)d6},
             {d2 * d3 * d4 * d5 * d6 / 8, d3 * d4 * d5 * d6, d4 * d5 * d6,
              d5 * d6, d6, 1},
             dtype::Int8()},
            {0, 1, 3, 4, 2, 5});
    }
    printf("\n\nNCHW32 to NCHW4\n");
    for (int i = 0; i < 20; i++) {
        int d1 = u.gen_single_val();
        int d2 = (u.gen_single_val() / 8 + 1) * 8;
        int d3 = u.gen_single_val();
        int d4 = u.gen_single_val();
        int d5 = 8;
        int d6 = 4;
        run({{(size_t)d1, (size_t)d2 / 8, (size_t)d3, (size_t)d4, (size_t)d5,
              (size_t)d6},
             {d2 * d3 * d4 * d5 * d6 / 8, d3 * d4 * d5 * d6, d4 * d5 * d6,
              d5 * d6, d6, 1},
             dtype::Int8()},
            {0, 1, 4, 2, 3, 5});
    }
}
TEST_F(CUDA, BENCHMARK_LAST_CONTIG_ALIGN_TEST) {
    static constexpr size_t RUNS = 10;
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        CUBenchmarker<Relayout> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
        printf("layout: %s bandwidth: %f GiB/s\n", layout.to_string().c_str(),
               2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used *
                       1000 / (1024 * 1024 * 1024));
    };
    UniformIntRNG u(4, 50);
    std::vector<size_t> _dim(6);
    std::vector<size_t> permutation(_dim.size());
    for (size_t r = 0; r < _dim.size(); r++) {
        size_t size = _dim.size();
        permutation[r] = size - 1 - r;
    }
    _dim[0] = u.gen_single_val();
    _dim[1] = u.gen_single_val();
    _dim[2] = u.gen_single_val();
    _dim[3] = u.gen_single_val();
    _dim[4] = u.gen_single_val();
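    // aligned case (the scenario this test targets): force the innermost
    // axis to a multiple of 4 so int8 rows start on 4-byte boundaries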
    _dim[5] = (u.gen_single_val() / 4 + 1) * 4;
    run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5]}, dtype::Int8()},
        permutation);
    // random shapes: the first five samples keep the last axis 4-aligned,
    // the rest use an arbitrary last-axis size
    for (size_t r = 0; r < _dim.size(); r++)
        permutation[r] = r;
    for (int nsample = 0; nsample < 20; nsample++) {
        COMPAT_RANDOM(_dim.begin(), _dim.end() - 1);
        COMPAT_RANDOM(permutation.begin(), permutation.end() - 1);
        if (nsample < 5)
            _dim[5] = (u.gen_single_val() / 4 + 1) * 4;
        else
            _dim[5] = u.gen_single_val();
        run({{_dim[0], _dim[1], _dim[2], _dim[3], _dim[4], _dim[5]},
             dtype::Int8()},
            permutation);
    }
}
#endif
TEST_F(CUDA, RELAYOUT) {
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
    {
        // contiguous stride
        args.emplace_back(TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()),
                          TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()));
        args.emplace_back(TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()),
                          TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Float16()),
                TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Float16()));
    }
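    // strided (non-contiguous) cases: {120, 40, 10, 2} and {180, 60, 15, 3}
    // are the contiguous strides of {2, 3, 4, 5} scaled by 2 and 3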
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
            TensorLayout({16, 128, 128}, {16384, 128, 1}, dtype::Float32()));
    {
        // 1d
        size_t n = 10000;
        args.emplace_back(TensorLayout({n}, {1}, dtype::Int32()),
                          TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(TensorLayout({n}, {1}, dtype::Int32()),
                          TensorLayout({n}, {2}, dtype::Int32()));
        args.emplace_back(TensorLayout({n}, {2}, dtype::Int32()),
                          TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(TensorLayout({n}, {2}, dtype::Int32()),
                          TensorLayout({n}, {2}, dtype::Int32()));
    }
    {
        // 2d
        size_t m = 200, n = 300, k = 400;
        ptrdiff_t k2 = k * 2;
        args.emplace_back(TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                          TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                          TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                          TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                          TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                          TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                          TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                          TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                          TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
    }
    {
        // 3d
        size_t m = 20, n = 30, k = 40;
        ptrdiff_t k2 = k;
        args.emplace_back(
                TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2},
                             dtype::Int32()),
                TensorLayout({m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2},
                             dtype::Int32()));
    }
    {
        // simplify_layout
        // 234..56
        // 2..3456
        args.emplace_back(
                TensorLayout(
                        {2, 3, 4, 5, 6},
                        {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
                        dtype::Int32()),
                TensorLayout({2, 3, 4, 5, 6},
                             {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
                             dtype::Int32()));
    }
    Checker<Relayout> checker(handle_cuda());
    for (auto&& arg : args) {
        checker.exec(TensorLayoutArray{arg.src, arg.dst});
    }
}
TEST_F(CUDA, TRANSPOSE_INT8) {
    auto run = [&](TensorLayout layout, std::vector<size_t> per) {
        TensorLayout src = layout.dimshuffle(per);
        TensorLayout dst = layout;
        dst.init_contiguous_stride();
        Checker<Relayout> checker(handle_cuda());
        checker.exec(TensorLayoutArray{src, dst});
    };
    //! last axis contiguous (NCHW4 <-> NCHW32)
    run({{5, 8, 4, 3, 8}, dtype::Int8()}, {1, 3, 0, 2, 4});
    run({{5, 8, 4, 3, 5}, dtype::Int8()}, {1, 3, 0, 2, 4});
    run({{5, 8, 4, 3, 64}, dtype::Int8()}, {1, 3, 0, 2, 4});
    //! last axis not contiguous (NCHW -> NCHW4)
    run({{7, 4, 32}, dtype::Int8()}, {2, 0, 1});
    run({{7, 4, 64}, dtype::Int8()}, {2, 0, 1});
    run({{7, 4, 7}, dtype::Int8()}, {2, 0, 1});
    //! plain copy
    run({{2, 3, 4, 5, 6},
         {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
         dtype::Int8()},
        {0, 1, 2, 3, 4});
}
TEST_F(CUDA, RELAYOUT_INT8) {
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
    {
        // contiguous stride
        args.emplace_back(TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Int8()),
                          TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Int8()));
        args.emplace_back(TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Int8()),
                          TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Int8()));
        args.emplace_back(
                TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Int8()),
                TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Int8()));
    }
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int8()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int8()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int8()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int8()));
    args.emplace_back(
            TensorLayout({16, 128, 128}, {49152, 384, 3}, dtype::Int8()),
            TensorLayout({16, 128, 128}, {16384, 128, 1}, dtype::Int8()));
    {
        // 1d
        size_t n = 10000;
        args.emplace_back(TensorLayout({n}, {1}, dtype::Int8()),
                          TensorLayout({n}, {1}, dtype::Int8()));
        args.emplace_back(TensorLayout({n}, {1}, dtype::Int8()),
                          TensorLayout({n}, {2}, dtype::Int8()));
        args.emplace_back(TensorLayout({n}, {2}, dtype::Int8()),
                          TensorLayout({n}, {1}, dtype::Int8()));
        args.emplace_back(TensorLayout({n}, {2}, dtype::Int8()),
                          TensorLayout({n}, {2}, dtype::Int8()));
    }
    {
        // 2d
        size_t m = 200, n = 300, k = 400;
        ptrdiff_t k2 = k * 2;
        args.emplace_back(TensorLayout({m, n}, {k2, 2}, dtype::Int8()),
                          TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int8()));
        args.emplace_back(TensorLayout({m, n}, {2, k2}, dtype::Int8()),
                          TensorLayout({m, n}, {2, k2 + 1}, dtype::Int8()));
        args.emplace_back(TensorLayout({m, n}, {2, k2}, dtype::Int8()),
                          TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int8()));
        args.emplace_back(TensorLayout({m, n}, {k2, 2}, dtype::Int8()),
                          TensorLayout({m, n}, {2, k2 + 1}, dtype::Int8()));
        args.emplace_back(TensorLayout({m, n}, {k2, 1}, dtype::Int8()),
                          TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int8()));
        args.emplace_back(TensorLayout({m, n}, {1, k2}, dtype::Int8()),
                          TensorLayout({m, n}, {1, k2 + 1}, dtype::Int8()));
        args.emplace_back(TensorLayout({m, n}, {1, k2}, dtype::Int8()),
                          TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int8()));
        args.emplace_back(TensorLayout({m, n}, {k2, 1}, dtype::Int8()),
                          TensorLayout({m, n}, {1, k2 + 1}, dtype::Int8()));
    }
    {
        // 3d
        size_t m = 20, n = 30, k = 40;
        ptrdiff_t k2 = k;
        args.emplace_back(
                TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2},
                             dtype::Int8()),
                TensorLayout({m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2},
                             dtype::Int8()));
    }
    {
        // simplify_layout
        // 234..56
        // 2..3456
        args.emplace_back(
                TensorLayout(
                        {2, 3, 4, 5, 6},
                        {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
                        dtype::Int8()),
                TensorLayout({2, 3, 4, 5, 6},
                             {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
                             dtype::Int8()));
        args.emplace_back(
                TensorLayout(
                        {2, 3, 4, 5, 6},
                        {4 * 3 * 4 * 5 * 6, 4 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
                        dtype::Int8()),
                TensorLayout({2, 3, 4, 5, 6},
                             {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
                             dtype::Int8()));
    }
    Checker<Relayout> checker(handle_cuda());
    for (auto&& arg : args) {
        checker.exec(TensorLayoutArray{arg.src, arg.dst});
    }
}
TEST_F(CUDA, RELAYOUT_TEST) {
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
    //! dst contiguous
    args.emplace_back(TensorLayout({5, 32, 9}, {288, 1, 32}, dtype::Int8()),
                      TensorLayout({5, 9, 32}, {288, 32, 1}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 9, 32}, {288, 1, 9}, dtype::Int8()),
                      TensorLayout({5, 32, 9}, {288, 9, 1}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 4, 9}, {36, 1, 4}, dtype::Int8()),
                      TensorLayout({5, 9, 4}, {36, 4, 1}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 9, 4}, {36, 1, 9}, dtype::Int8()),
                      TensorLayout({5, 4, 9}, {36, 9, 1}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 32, 4}, {128, 1, 32}, dtype::Int8()),
                      TensorLayout({5, 4, 32}, {128, 32, 1}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 4, 32}, {128, 1, 4}, dtype::Int8()),
                      TensorLayout({5, 32, 4}, {128, 4, 1}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 7, 5}, {35, 1, 7}, dtype::Int8()),
                      TensorLayout({5, 5, 7}, {35, 7, 1}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 5, 7}, {35, 1, 5}, dtype::Int8()),
                      TensorLayout({5, 7, 5}, {35, 5, 1}, dtype::Int8()));
    //! src contiguous
    args.emplace_back(TensorLayout({5, 9, 32}, {288, 32, 1}, dtype::Int8()),
                      TensorLayout({5, 32, 9}, {288, 1, 32}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 32, 9}, {288, 9, 1}, dtype::Int8()),
                      TensorLayout({5, 9, 32}, {288, 1, 9}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 9, 4}, {36, 4, 1}, dtype::Int8()),
                      TensorLayout({5, 4, 9}, {36, 1, 4}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 4, 9}, {36, 9, 1}, dtype::Int8()),
                      TensorLayout({5, 9, 4}, {36, 1, 9}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 4, 32}, {128, 32, 1}, dtype::Int8()),
                      TensorLayout({5, 32, 4}, {128, 1, 32}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 32, 4}, {128, 4, 1}, dtype::Int8()),
                      TensorLayout({5, 4, 32}, {128, 1, 4}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 5, 7}, {35, 7, 1}, dtype::Int8()),
                      TensorLayout({5, 7, 5}, {35, 1, 7}, dtype::Int8()));
    args.emplace_back(TensorLayout({5, 7, 5}, {35, 5, 1}, dtype::Int8()),
                      TensorLayout({5, 5, 7}, {35, 1, 5}, dtype::Int8()));
    //! cross
    args.emplace_back(
            TensorLayout({5, 9, 32}, {288 * 4, 32 * 3, 1}, dtype::Int8()),
            TensorLayout({5, 32, 9}, {288 * 4, 1, 32 * 3}, dtype::Int8()));
    args.emplace_back(
            TensorLayout({5, 32, 9}, {288 * 3, 9 * 2, 1}, dtype::Int8()),
            TensorLayout({5, 9, 32}, {288 * 3, 1, 9 * 2}, dtype::Int8()));
    args.emplace_back(
            TensorLayout({5, 9, 4}, {36 * 10, 4 * 7, 1}, dtype::Int8()),
            TensorLayout({5, 4, 9}, {36 * 10, 1, 4 * 7}, dtype::Int8()));
    Checker<Relayout> checker(handle_cuda());
    for (auto&& arg : args) {
        checker.exec(TensorLayoutArray{arg.src, arg.dst});
    }
}
// vim: syntax=cpp.doxygen
