
relayout.cpp
#include "test/fallback/fixture.h"

#include "megdnn/basic_types.h"
#include "test/common/checker.h"
#include "test/common/relayout.h"
#include "test/common/task_record_check.h"
#include "test/common/tensor.h"

#include <cstdio>
#include <cstring>
#include <ctime>

using namespace megdnn;
using namespace test;

namespace {
template <typename tag>
class FALLBACK_RELAYOUT : public FALLBACK {};
TYPED_TEST_CASE(FALLBACK_RELAYOUT, relayout::test_types);
TYPED_TEST(FALLBACK_RELAYOUT, run) {
    relayout::run_test<TypeParam>(this->handle());
}
}  // namespace
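
// Contiguous src and dst of identical shape: the relayout degenerates to a
// plain element-wise copy.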
TEST_F(FALLBACK, RELAYOUT_CONTINUE) {
    Checker<Relayout> checker(handle());
    checker.set_dtype(0, dtype::Int32());
    checker.set_dtype(1, dtype::Int32());
    checker.exec({{2, 2, 2}, {2, 2, 2}});
}
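
// Same copy as above, but run through TaskRecordChecker to verify the
// operator still produces correct results when its dispatch is recorded
// and replayed.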
TEST_F(FALLBACK, RELAYOUT_RECORD) {
    TaskRecordChecker<Relayout> checker(1);
    checker.set_dtype(0, dtype::Int32());
    checker.set_dtype(1, dtype::Int32());
    checker.exec({{2, 2, 2}, {2, 2, 2}});
}
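
// 4-bit quantized relayout: two int4 values are packed per byte, so the
// transposed, sub-tensor, and padded-stride layouts below exercise the
// sub-byte handling in the kernel.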
TEST_F(FALLBACK, RELAYOUT_Q4) {
    Checker<Relayout> checker(handle());
    UniformIntRNG rng_int4{-7, 7};
    checker.set_rng(0, &rng_int4)
            .set_rng(1, &rng_int4)
            .set_dtype(0, dtype::QuantizedS4(1.f))
            .set_dtype(1, dtype::QuantizedS4(1.f))
            .execs({{2, 2, 1, 1}, {1, 1, 2, 2}})
            .execs({{1, 64, 15, 15}, {1, 15, 15, 64}})
            .execs({{1, 5, 9, 32}, {1, 5, 32, 9}})
            .execl(TensorLayoutArray{
                    {{6400}, {1}, dtype::QuantizedS4{1.f}},
                    {{20, 320}, {1024, 1}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{156}, {1}, dtype::QuantizedS4{1.f}},
                    {{13, 3, 4}, {16, 1, 4}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{48}, {1}, dtype::QuantizedS4{1.f}},
                    {{3, 4, 4}, {16, 1, 4}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{84}, {1}, dtype::QuantizedS4{1.f}},
                    {{3, 4, 7}, {28, 1, 4}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{336}, {1}, dtype::QuantizedS4{1.f}},
                    {{3, 4, 7, 4}, {112, 4, 16, 1}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{54}, {1}, dtype::QuantizedS4{1.f}},
                    {{6, 3, 3}, {16, 4, 1}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}},
                    {{20, 60, 3}, {256, 4, 1}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{20, 20, 3, 3}, {256, 12, 4, 1}, dtype::QuantizedS4{1.f}},
                    {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{5, 16, 7, 7, 4}, {3136, 196, 28, 4, 1}, dtype::QuantizedS4{1.f}},
                    {{5, 16, 7, 7, 4}, {3136, 4, 448, 64, 1}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{5, 7, 7, 16, 4}, {3136, 448, 64, 4, 1}, dtype::QuantizedS4{1.f}},
                    {{5, 7, 7, 16, 4}, {3136, 28, 4, 196, 1}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{5, 2, 7, 7, 32}, {3136, 1568, 224, 32, 1}, dtype::QuantizedS4{1.f}},
                    {{5, 2, 7, 7, 32}, {3136, 32, 448, 64, 1}, dtype::QuantizedS4{1.f}}})
            .execl(TensorLayoutArray{
                    {{5, 7, 7, 2, 32}, {3136, 448, 64, 32, 1}, dtype::QuantizedS4{1.f}},
                    {{5, 7, 7, 2, 32}, {3136, 224, 32, 1568, 1}, dtype::QuantizedS4{1.f}}});
}
#if MEGDNN_WITH_BENCHMARK
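// Benchmark relayout patterns typical of CV-style image transforms.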
TEST_F(FALLBACK, BENCHMARK_RELAYOUT_CV) {
    relayout::run_cv_benchmark(handle());
}
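
// Time the fallback relayout kernel against the naive reference on a set of
// contiguous <-> non-contiguous copies, asserting that both implementations
// produce byte-identical output.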
TEST_F(FALLBACK, BENCHMARK_RELAYOUT) {
    auto naive_handle = create_cpu_handle(2);
    bool verbose = false;
    auto run = [&](bool out_cont, const TensorLayout& cont_layout,
                   const TensorLayout& noncont_layout) {
        megdnn_assert(
                cont_layout.dtype == dtype::Int32() &&
                noncont_layout.dtype == dtype::Int32() &&
                noncont_layout.span().low_byte == 0);
        auto noncont_storage_size = noncont_layout.span().high_elem;
        Tensor<dt_int32> noncont_storage0(
                handle(), {{noncont_storage_size}, dtype::Int32()}),
                noncont_storage1(handle(), {{noncont_storage_size}, dtype::Int32()}),
                cont_storage0(handle(), cont_layout),
                cont_storage1(handle(), cont_layout);
        auto noncont0 = noncont_storage0.tensornd(),
             noncont1 = noncont_storage1.tensornd();
        noncont0.layout = noncont_layout;
        noncont1.layout = noncont_layout;
        TensorND src, dst0, dst1;
        if (out_cont) {
            src = noncont0;
            dst0 = cont_storage0.tensornd();
            dst1 = cont_storage1.tensornd();
            auto ptr = src.ptr<int>();
            for (size_t i = 0; i < noncont_storage_size; ++i) {
                ptr[i] = i;
            }
        } else {
            memset(noncont_storage0.ptr(), -1,
                   noncont_storage0.layout().span().dist_byte());
            memset(noncont_storage1.ptr(), -1,
                   noncont_storage1.layout().span().dist_byte());
            src = cont_storage0.tensornd();
            dst0 = noncont0;
            dst1 = noncont1;
            auto ptr = src.ptr<int>();
            for (size_t i = 0, it = src.layout.total_nr_elems(); i < it; ++i) {
                ptr[i] = i;
            }
        }
        auto opr_cur = handle()->create_operator<Relayout>();
        auto opr_naive = naive_handle->create_operator<Relayout>();
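        // Execute once to warm up, then time a single additional run with
        // clock(); the result is in milliseconds.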
        auto timeit = [&src](Relayout* opr, TensorND out) {
            opr->exec(src, out);
            auto start = clock();
            opr->exec(src, out);
            auto stop = clock();
            return (stop - start) * 1e3 / CLOCKS_PER_SEC;
        };
        auto t1 = timeit(opr_naive.get(), dst1), t0 = timeit(opr_cur.get(), dst0);
        double tot_size_gb_ms = cont_layout.total_nr_elems() * sizeof(int) /
                                1024.0 / 1024.0 / 1024.0 * 1e3;
        if (verbose) {
            printf("noncont-%zu dir=%d: fallback=%7.3fms,%5.2fGiB/s "
                   "naive=%7.3fms,%5.2fGiB/s\n",
                   noncont_layout.collapse_contiguous().ndim, out_cont, t0,
                   tot_size_gb_ms / t0, t1, tot_size_gb_ms / t1);
        }
        ASSERT_EQ(
                0, memcmp(dst0.ptr<int>(), dst1.ptr<int>(),
                          dst0.layout.span().dist_byte()));
    };
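    // Build a (cont_layout, noncont_layout) pair from a shape: swap > 0
    // exchanges dims (swap - 1, swap) to emulate a transpose; sub pads the
    // underlying storage by one element in the last dim so the
    // non-contiguous tensor becomes a sub-tensor of a larger allocation.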
    auto run_preset = [&](const TensorShape& noncont_shp, int swap, bool sub,
                          bool out_cont) {
        TensorLayout noncont_layout(noncont_shp, dtype::Int32());
        if (swap) {
            auto a = swap - 1, b = swap;
            std::swap(noncont_layout.shape[a], noncont_layout.shape[b]);
            std::swap(noncont_layout.stride[a], noncont_layout.stride[b]);
        }
        TensorLayout cont_layout = noncont_layout;
        cont_layout.init_contiguous_stride();
        TensorShape noncont_storage_shp(cont_layout);
        if (sub) {
            ++noncont_storage_shp[noncont_layout.ndim - 1];
            noncont_layout.init_contiguous_stride(noncont_storage_shp);
            --noncont_layout.shape[noncont_layout.ndim - 1];
        }
        run(out_cont, cont_layout, noncont_layout);
    };
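    // The small shapes below serve as quick correctness checks; timings are
    // only printed for the large shapes once verbose is switched on.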
    for (bool out_cont : {false, true}) {
        verbose = false;
        run_preset({2, 3}, 1, false, out_cont);
        run_preset({2, 2, 2}, 0, true, out_cont);
        {
            // padding-like
            TensorLayout cont{{2, 3, 3}, dtype::Int32()}, noncont = cont;
            noncont.stride[1] = 5;
            noncont.stride[0] = 25;
            run(out_cont, cont, noncont);
        }
        verbose = true;
        run_preset({1234, 5678}, 0, false, out_cont);
        run_preset({256, 256, 256}, 0, true, out_cont);
        run_preset({2, 3, 1024, 1024}, 1, false, out_cont);
        run_preset({1025, 2049}, 1, false, out_cont);
        run_preset({2049, 1025}, 1, false, out_cont);
        run_preset({10, 1024, 1024}, 2, false, out_cont);
        {
            // padding-like
            TensorLayout cont{{60, 60, 60}, dtype::Int32()}, noncont = cont;
            noncont.stride[1] = 63;
            noncont.stride[0] = 63 * 63;
            run(out_cont, cont, noncont);
        }
    }
}
#endif
// vim: syntax=cpp.doxygen