You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

relayout.cpp 9.3 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. #include "hcc_detail/hcc_defs_prologue.h"
  2. #include "megdnn/oprs.h"
  3. #include "test/common/benchmarker.h"
  4. #include "test/common/checker.h"
  5. #include "test/common/relayout.h"
  6. #include "test/rocm/benchmarker.h"
  7. #include "test/rocm/fixture.h"
  8. using namespace megdnn;
  9. using namespace test;
  10. namespace {
  11. template <typename tag>
  12. class ROCM_RELAYOUT : public ROCM {};
  13. TYPED_TEST_CASE(ROCM_RELAYOUT, relayout::test_types);
  14. TYPED_TEST(ROCM_RELAYOUT, run) {
  15. relayout::run_test<TypeParam>(this->handle_rocm());
  16. }
  17. } // namespace
  18. TEST_F(ROCM, RELAYOUT_MEMCPY_ASYNC) {
  19. Checker<Relayout> checker(handle_rocm());
  20. checker.set_epsilon(1e-3);
  21. struct Arg {
  22. TensorLayout src, dst;
  23. Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
  24. };
  25. std::vector<Arg> args;
  26. // test for contig
  27. args.emplace_back(
  28. Arg{{{51200}, {1}, dtype::Float32()}, {{51200}, {1}, dtype::Float32()}});
  29. // test for copy_2d
  30. args.emplace_back(
  31. Arg{{{51200}, {9}, dtype::Float32()}, {{51200}, {1}, dtype::Float32()}});
  32. for (auto&& arg : args) {
  33. checker.set_dtype(0, dtype::Float32())
  34. .set_dtype(1, dtype::Float32())
  35. .execl({arg.src, arg.dst});
  36. }
  37. }
  38. #if MEGDNN_WITH_BENCHMARK
  39. TEST_F(ROCM, RELAYOUT_BENCHMARK) {
  40. //! benchmark contious layout, such as (a, b, c, d) -> (b, a, c,d)
  41. //! just change the first two axis
  42. megdnn::rocm::enable_miopen_algo_search(handle_rocm(), true);
  43. auto benchmarker =
  44. ROCMBenchmarker<RelayoutForward>(handle_rocm(), handle_naive(false));
  45. benchmarker.set_display(true);
  46. auto run = [&](const TensorLayoutArray& layouts) {
  47. for (auto&& layout : layouts) {
  48. TensorLayout src = layout.dimshuffle({1, 0, 2});
  49. TensorLayout dst = layout;
  50. std::swap(dst.shape[0], dst.shape[1]);
  51. dst.init_contiguous_stride();
  52. benchmarker.execl({src, dst});
  53. auto used = benchmarker.execl({src, dst});
  54. used = benchmarker.execl({src, dst});
  55. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  56. 2 * layout.total_nr_elems() * layout.dtype.size() / used * 1000 /
  57. (1024 * 1024 * 1024));
  58. }
  59. };
  60. TensorLayoutArray layouts = {
  61. {{12, 23, 2}, dtype::Int32()}, {{12, 23, 8}, dtype::Int32()},
  62. {{12, 23, 17}, dtype::Int32()}, {{12, 23, 64}, dtype::Int32()},
  63. {{12, 23, 129}, dtype::Int32()}, {{12, 23, 256}, dtype::Int32()},
  64. {{12, 23, 1029}, dtype::Int32()}, {{12, 23, 4096}, dtype::Int32()},
  65. {{12, 23, 9143}, dtype::Int32()}, {{12, 23, 18284}, dtype::Int32()},
  66. {{2, 2, 1000000}, dtype::Int32()},
  67. };
  68. run(layouts);
  69. auto run2 = [&](const TensorLayoutArray& layouts) {
  70. for (auto&& layout : layouts) {
  71. TensorLayout src = layout.dimshuffle({0, 2, 1, 3});
  72. TensorLayout dst = layout;
  73. std::swap(dst.shape[0], dst.shape[1]);
  74. dst.init_contiguous_stride();
  75. benchmarker.execl({src, dst});
  76. auto used = benchmarker.execl({src, dst});
  77. used = benchmarker.execl({src, dst});
  78. printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
  79. 2 * layout.total_nr_elems() * layout.dtype.size() / used * 1000 /
  80. (1024 * 1024 * 1024));
  81. }
  82. };
  83. layouts = {
  84. {{3, 12, 24, 100}, dtype::Int32()},
  85. {{3, 12, 24, 1029}, dtype::Int32()},
  86. {{3, 4, 24, 9143}, dtype::Int32()},
  87. {{3, 4, 24, 18284}, dtype::Int32()},
  88. };
  89. run2(layouts);
  90. }
  91. TEST_F(ROCM, RELAYOUT_LAST_CONTIG_BENCHMARK) {
  92. megdnn::rocm::enable_miopen_algo_search(handle_rocm(), true);
  93. auto benchmarker =
  94. ROCMBenchmarker<RelayoutForward>(handle_rocm(), handle_naive(false));
  95. benchmarker.set_display(true);
  96. TensorLayout src =
  97. TensorLayout({5, 5, 100000}, {800000, 100000, 1}, dtype::Float32());
  98. TensorLayout dst =
  99. TensorLayout({5, 5, 100000}, {700000, 100000, 1}, dtype::Float32());
  100. benchmarker.execl({src, dst});
  101. auto used = benchmarker.execl({src, dst});
  102. used = benchmarker.execl({src, dst});
  103. printf("src: %s dst: %s bandwith: %f gbps/s\n", src.to_string().c_str(),
  104. dst.to_string().c_str(),
  105. 2 * src.total_nr_elems() * src.dtype.size() / used * 1000 /
  106. (1024 * 1024 * 1024));
  107. }
  108. #endif
  109. TEST_F(ROCM, RELAYOUT) {
  110. struct Arg {
  111. TensorLayout src, dst;
  112. Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
  113. };
  114. std::vector<Arg> args;
  115. #if !MEGDNN_DISABLE_FLOAT16
  116. {
  117. // contiguous stride
  118. args.emplace_back(
  119. TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()),
  120. TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()));
  121. args.emplace_back(
  122. TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()),
  123. TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()));
  124. args.emplace_back(
  125. TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Float16()),
  126. TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Float16()));
  127. }
  128. args.emplace_back(
  129. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()),
  130. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()));
  131. args.emplace_back(
  132. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
  133. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()));
  134. args.emplace_back(
  135. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
  136. TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Float16()));
  137. #endif
  138. args.emplace_back(
  139. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()),
  140. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()));
  141. args.emplace_back(
  142. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
  143. TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()));
  144. args.emplace_back(
  145. TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
  146. TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int32()));
  147. {
  148. // 1d
  149. size_t n = 10000;
  150. args.emplace_back(
  151. TensorLayout({n}, {1}, dtype::Int32()),
  152. TensorLayout({n}, {1}, dtype::Int32()));
  153. args.emplace_back(
  154. TensorLayout({n}, {1}, dtype::Int32()),
  155. TensorLayout({n}, {2}, dtype::Int32()));
  156. args.emplace_back(
  157. TensorLayout({n}, {2}, dtype::Int32()),
  158. TensorLayout({n}, {1}, dtype::Int32()));
  159. args.emplace_back(
  160. TensorLayout({n}, {2}, dtype::Int32()),
  161. TensorLayout({n}, {2}, dtype::Int32()));
  162. }
  163. {
  164. // 2d
  165. size_t m = 200, n = 300, k = 400;
  166. ptrdiff_t k2 = k * 2;
  167. args.emplace_back(
  168. TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
  169. TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
  170. args.emplace_back(
  171. TensorLayout({m, n}, {2, k2}, dtype::Int32()),
  172. TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
  173. args.emplace_back(
  174. TensorLayout({m, n}, {2, k2}, dtype::Int32()),
  175. TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
  176. args.emplace_back(
  177. TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
  178. TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
  179. args.emplace_back(
  180. TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
  181. TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
  182. args.emplace_back(
  183. TensorLayout({m, n}, {1, k2}, dtype::Int32()),
  184. TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
  185. args.emplace_back(
  186. TensorLayout({m, n}, {1, k2}, dtype::Int32()),
  187. TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
  188. args.emplace_back(
  189. TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
  190. TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
  191. }
  192. {
  193. // 3d
  194. size_t m = 20, n = 30, k = 40;
  195. ptrdiff_t k2 = k;
  196. args.emplace_back(
  197. TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()),
  198. TensorLayout(
  199. {m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()));
  200. }
  201. {
  202. // simplify_layout
  203. // 234..56
  204. // 2..3456
  205. args.emplace_back(
  206. TensorLayout(
  207. {2, 3, 4, 5, 6},
  208. {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
  209. dtype::Int32()),
  210. TensorLayout(
  211. {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
  212. dtype::Int32()));
  213. }
  214. Checker<Relayout> checker(handle_rocm());
  215. for (auto&& arg : args) {
  216. checker.exec(TensorLayoutArray{arg.src, arg.dst});
  217. }
  218. }
  219. // vim: syntax=cpp.doxygen