/**
 * \file dnn/test/rocm/relayout.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "hcc_detail/hcc_defs_prologue.h"

#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/relayout.h"
#include "test/rocm/benchmarker.h"
#include "test/rocm/fixture.h"

using namespace megdnn;
using namespace test;

namespace {
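// Instantiate the shared relayout test suite (test/common/relayout.h) once per
// entry in relayout::test_types; each case runs on the ROCm handle.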
template <typename tag>
class ROCM_RELAYOUT : public ROCM {};
TYPED_TEST_CASE(ROCM_RELAYOUT, relayout::test_types);
TYPED_TEST(ROCM_RELAYOUT, run) {
    relayout::run_test<TypeParam>(this->handle_rocm());
}
}  // namespace
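
// Layout pairs that reduce to a flat memcpy or to a 2-D strided copy; results
// are cross-checked against the reference implementation.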
TEST_F(ROCM, RELAYOUT_MEMCPY_ASYNC) {
    Checker<Relayout> checker(handle_rocm());
    checker.set_epsilon(1e-3);
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
    // test for contig
    args.emplace_back(
            Arg{{{51200}, {1}, dtype::Float32()}, {{51200}, {1}, dtype::Float32()}});
    // test for copy_2d
    args.emplace_back(
            Arg{{{51200}, {9}, dtype::Float32()}, {{51200}, {1}, dtype::Float32()}});
    for (auto&& arg : args) {
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .execl({arg.src, arg.dst});
    }
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(ROCM, RELAYOUT_BENCHMARK) {
    //! benchmark contiguous layouts such as (a, b, c, d) -> (b, a, c, d),
    //! i.e. only the first two axes are swapped
    megdnn::rocm::enable_miopen_algo_search(handle_rocm(), true);
    auto benchmarker =
            ROCMBenchmarker<RelayoutForward>(handle_rocm(), handle_naive(false));
    benchmarker.set_display(true);
    auto run = [&](const TensorLayoutArray& layouts) {
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({1, 0, 2});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
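            // run three times: the first execution warms up the device and
            // only the last timing is kept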
            benchmarker.execl({src, dst});
            auto used = benchmarker.execl({src, dst});
            used = benchmarker.execl({src, dst});
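            // bytes moved = one read plus one write per element; `used` is the
            // measured time in milliseconds, hence the factor of 1000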
            printf("layout: %s bandwidth: %f GiB/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() / used * 1000 /
                           (1024 * 1024 * 1024));
        }
    };
    TensorLayoutArray layouts = {
            {{12, 23, 2}, dtype::Int32()},     {{12, 23, 8}, dtype::Int32()},
            {{12, 23, 17}, dtype::Int32()},    {{12, 23, 64}, dtype::Int32()},
            {{12, 23, 129}, dtype::Int32()},   {{12, 23, 256}, dtype::Int32()},
            {{12, 23, 1029}, dtype::Int32()},  {{12, 23, 4096}, dtype::Int32()},
            {{12, 23, 9143}, dtype::Int32()},  {{12, 23, 18284}, dtype::Int32()},
            {{2, 2, 1000000}, dtype::Int32()},
    };
    run(layouts);
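    //! same measurement for 4-D layouts, with the middle axes exchanged in src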
    auto run2 = [&](const TensorLayoutArray& layouts) {
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({0, 2, 1, 3});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            benchmarker.execl({src, dst});
            auto used = benchmarker.execl({src, dst});
            used = benchmarker.execl({src, dst});
            printf("layout: %s bandwidth: %f GiB/s\n", layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() / used * 1000 /
                           (1024 * 1024 * 1024));
        }
    };
    layouts = {
            {{3, 12, 24, 100}, dtype::Int32()},
            {{3, 12, 24, 1029}, dtype::Int32()},
            {{3, 4, 24, 9143}, dtype::Int32()},
            {{3, 4, 24, 18284}, dtype::Int32()},
    };
    run2(layouts);
}
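
// The innermost dims of src and dst are contiguous while the outermost stride
// is padded differently on each side, so the copy is effectively a batched
// 2-D strided copy.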
TEST_F(ROCM, RELAYOUT_LAST_CONTIG_BENCHMARK) {
    megdnn::rocm::enable_miopen_algo_search(handle_rocm(), true);
    auto benchmarker =
            ROCMBenchmarker<RelayoutForward>(handle_rocm(), handle_naive(false));
    benchmarker.set_display(true);
    TensorLayout src =
            TensorLayout({5, 5, 100000}, {800000, 100000, 1}, dtype::Float32());
    TensorLayout dst =
            TensorLayout({5, 5, 100000}, {700000, 100000, 1}, dtype::Float32());
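    // warm up once, then keep the timing of the last run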
    benchmarker.execl({src, dst});
    auto used = benchmarker.execl({src, dst});
    used = benchmarker.execl({src, dst});
    printf("src: %s dst: %s bandwidth: %f GiB/s\n", src.to_string().c_str(),
           dst.to_string().c_str(),
           2 * src.total_nr_elems() * src.dtype.size() / used * 1000 /
                   (1024 * 1024 * 1024));
}
#endif
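
// General correctness over hand-picked strided layouts: padded vs. contiguous
// strides, interleaved strides (every 2nd or 3rd element), 1-D/2-D/3-D strided
// copies, and layouts that simplify_layout should collapse.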
TEST_F(ROCM, RELAYOUT) {
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
#if !MEGDNN_DISABLE_FLOAT16
    {
        // contiguous stride
        args.emplace_back(
                TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()),
                TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()),
                TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Float16()),
                TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Float16()));
    }
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Float16()));
#endif
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int32()));
    {
        // 1d
        size_t n = 10000;
        args.emplace_back(
                TensorLayout({n}, {1}, dtype::Int32()),
                TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {1}, dtype::Int32()),
                TensorLayout({n}, {2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {2}, dtype::Int32()),
                TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({n}, {2}, dtype::Int32()),
                TensorLayout({n}, {2}, dtype::Int32()));
    }
    {
        // 2d
        size_t m = 200, n = 300, k = 400;
        ptrdiff_t k2 = k * 2;
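        // row- and column-major stride combinations; one dst stride is offset
        // by 1 so src and dst never share the exact same layout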
        args.emplace_back(
                TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(
                TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
    }
    {
        // 3d
        size_t m = 20, n = 30, k = 40;
        ptrdiff_t k2 = k;
        args.emplace_back(
                TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()),
                TensorLayout(
                        {m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2},
                        dtype::Int32()));
    }
    {
        // simplify_layout: mergeable contiguous dims;
        // src collapses to (2*3*4, 5*6), dst collapses to (2, 3*4*5*6)
        args.emplace_back(
                TensorLayout(
                        {2, 3, 4, 5, 6},
                        {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
                        dtype::Int32()),
                TensorLayout(
                        {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
                        dtype::Int32()));
    }
    Checker<Relayout> checker(handle_rocm());
    for (auto&& arg : args) {
        checker.exec(TensorLayoutArray{arg.src, arg.dst});
    }
}

// vim: syntax=cpp.doxygen