
relayout.cpp 9.8 kB

/**
 * \file dnn/test/rocm/relayout.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "hcc_detail/hcc_defs_prologue.h"
#include "test/rocm/fixture.h"

#include "megdnn/oprs.h"
#include "test/common/checker.h"
#include "test/common/benchmarker.h"
#include "test/common/relayout.h"
#include "test/rocm/benchmarker.h"

using namespace megdnn;
using namespace test;
namespace {
template <typename tag>
class ROCM_RELAYOUT : public ROCM {};

TYPED_TEST_CASE(ROCM_RELAYOUT, relayout::test_types);

TYPED_TEST(ROCM_RELAYOUT, run) {
    relayout::run_test<TypeParam>(this->handle_rocm());
}
}  // anonymous namespace
TEST_F(ROCM, RELAYOUT_MEMCPY_ASYNC) {
    Checker<Relayout> checker(handle_rocm());
    checker.set_epsilon(1e-3);
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
    // test for contig
    args.emplace_back(Arg{{{51200}, {1}, dtype::Float32()},
                          {{51200}, {1}, dtype::Float32()}});
    // test for copy_2d
    args.emplace_back(Arg{{{51200}, {9}, dtype::Float32()},
                          {{51200}, {1}, dtype::Float32()}});
    for (auto&& arg : args) {
        checker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .execl({arg.src, arg.dst});
    }
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(ROCM, RELAYOUT_BENCHMARK) {
    //! benchmark contiguous layouts such as (a, b, c, d) -> (b, a, c, d),
    //! where only the first two axes are swapped
    megdnn::rocm::enable_miopen_algo_search(handle_rocm(), true);
    auto benchmarker =
            ROCMBenchmarker<RelayoutForward>(handle_rocm(), handle_naive(false));
    benchmarker.set_display(true);
    auto run = [&](const TensorLayoutArray& layouts) {
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({1, 0, 2});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            // warm up, then keep the timing of the last run
            benchmarker.execl({src, dst});
            auto used = benchmarker.execl({src, dst});
            used = benchmarker.execl({src, dst});
            printf("layout: %s bandwidth: %f GiB/s\n",
                   layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() / used *
                           1000 / (1024 * 1024 * 1024));
        }
    };
    TensorLayoutArray layouts = {
            {{12, 23, 2}, dtype::Int32()},
            {{12, 23, 8}, dtype::Int32()},
            {{12, 23, 17}, dtype::Int32()},
            {{12, 23, 64}, dtype::Int32()},
            {{12, 23, 129}, dtype::Int32()},
            {{12, 23, 256}, dtype::Int32()},
            {{12, 23, 1029}, dtype::Int32()},
            {{12, 23, 4096}, dtype::Int32()},
            {{12, 23, 9143}, dtype::Int32()},
            {{12, 23, 18284}, dtype::Int32()},
            {{2, 2, 1000000}, dtype::Int32()},
    };
    run(layouts);
    auto run2 = [&](const TensorLayoutArray& layouts) {
        for (auto&& layout : layouts) {
            TensorLayout src = layout.dimshuffle({0, 2, 1, 3});
            TensorLayout dst = layout;
            std::swap(dst.shape[0], dst.shape[1]);
            dst.init_contiguous_stride();
            // warm up, then keep the timing of the last run
            benchmarker.execl({src, dst});
            auto used = benchmarker.execl({src, dst});
            used = benchmarker.execl({src, dst});
            printf("layout: %s bandwidth: %f GiB/s\n",
                   layout.to_string().c_str(),
                   2 * layout.total_nr_elems() * layout.dtype.size() / used *
                           1000 / (1024 * 1024 * 1024));
        }
    };
    layouts = {
            {{3, 12, 24, 100}, dtype::Int32()},
            {{3, 12, 24, 1029}, dtype::Int32()},
            {{3, 4, 24, 9143}, dtype::Int32()},
            {{3, 4, 24, 18284}, dtype::Int32()},
    };
    run2(layouts);
}
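// Note on the bandwidth figures printed above, reading off the printf
// expression: the factor 2 counts one read of the source plus one write of
// the destination, "used" is the time of one run in milliseconds (hence the
// * 1000 to get bytes per second), and the division by 1024^3 expresses the
// result in GiB/s. For the {12, 23, 4096} Int32 layout, for example, one run
// moves 2 * 12 * 23 * 4096 * 4 bytes, roughly 8.6 MiB.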
TEST_F(ROCM, RELAYOUT_LAST_CONTIG_BENCHMARK) {
    megdnn::rocm::enable_miopen_algo_search(handle_rocm(), true);
    auto benchmarker =
            ROCMBenchmarker<RelayoutForward>(handle_rocm(), handle_naive(false));
    benchmarker.set_display(true);
    TensorLayout src =
            TensorLayout({5, 5, 100000}, {800000, 100000, 1}, dtype::Float32());
    TensorLayout dst =
            TensorLayout({5, 5, 100000}, {700000, 100000, 1}, dtype::Float32());
    // warm up, then keep the timing of the last run
    benchmarker.execl({src, dst});
    auto used = benchmarker.execl({src, dst});
    used = benchmarker.execl({src, dst});
    printf("src: %s dst: %s bandwidth: %f GiB/s\n", src.to_string().c_str(),
           dst.to_string().c_str(),
           2 * src.total_nr_elems() * src.dtype.size() / used * 1000 /
                   (1024 * 1024 * 1024));
}
#endif
TEST_F(ROCM, RELAYOUT) {
    struct Arg {
        TensorLayout src, dst;
        Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {}
    };
    std::vector<Arg> args;
#if !MEGDNN_DISABLE_FLOAT16
    {
        // contiguous stride
        args.emplace_back(TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()),
                          TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()));
        args.emplace_back(TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()),
                          TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()));
        args.emplace_back(
                TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Float16()),
                TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Float16()));
    }
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Float16()));
#endif
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()));
    args.emplace_back(
            TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()),
            TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int32()));
    {
        // 1d
        size_t n = 10000;
        args.emplace_back(TensorLayout({n}, {1}, dtype::Int32()),
                          TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(TensorLayout({n}, {1}, dtype::Int32()),
                          TensorLayout({n}, {2}, dtype::Int32()));
        args.emplace_back(TensorLayout({n}, {2}, dtype::Int32()),
                          TensorLayout({n}, {1}, dtype::Int32()));
        args.emplace_back(TensorLayout({n}, {2}, dtype::Int32()),
                          TensorLayout({n}, {2}, dtype::Int32()));
    }
    {
        // 2d
        size_t m = 200, n = 300, k = 400;
        ptrdiff_t k2 = k * 2;
        args.emplace_back(TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                          TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                          TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {2, k2}, dtype::Int32()),
                          TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {k2, 2}, dtype::Int32()),
                          TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                          TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                          TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {1, k2}, dtype::Int32()),
                          TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32()));
        args.emplace_back(TensorLayout({m, n}, {k2, 1}, dtype::Int32()),
                          TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32()));
    }
    {
        // 3d
        size_t m = 20, n = 30, k = 40;
        ptrdiff_t k2 = k;
        args.emplace_back(
                TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()),
                TensorLayout({m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2},
                             dtype::Int32()));
    }
    {
        // simplify_layout
        // 234..56
        // 2..3456
        args.emplace_back(
                TensorLayout(
                        {2, 3, 4, 5, 6},
                        {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1},
                        dtype::Int32()),
                TensorLayout({2, 3, 4, 5, 6},
                             {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1},
                             dtype::Int32()));
    }
    Checker<Relayout> checker(handle_rocm());
    for (auto&& arg : args) {
        checker.exec(TensorLayoutArray{arg.src, arg.dst});
    }
}

// vim: syntax=cpp.doxygen

The MegEngine package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU and GPU build to choose between. To run GPU programs, make sure the machine has a GPU installed along with a working driver. If you would like to try deep-learning development on a cloud GPU compute platform, you are welcome to visit the MegStudio platform.
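As a quick sanity check before running GPU programs, a short Python snippet can confirm that MegEngine actually sees a device. This is a minimal sketch assuming a recent MegEngine pip package, where is_cuda_available and get_device_count are exposed by the megengine.device module:

# Minimal sketch: confirm the bundled CUDA runtime can see a GPU.
from megengine.device import get_device_count, is_cuda_available

if is_cuda_available():
    print("CUDA available, %d GPU(s) detected" % get_device_count("gpu"))
else:
    print("No usable GPU found; check the hardware and driver installation.")

If no device is reported on a machine that has one, the driver installation is the usual suspect.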