You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

relayout_helper.h 4.5 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. /**
  2. * \file dnn/src/common/relayout_helper.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #pragma once
  13. #include "megdnn/oprs.h"
  14. #include "src/common/utils.h"
  15. #include "midout.h"
  16. MIDOUT_DECL(transpose_fallback)
  17. namespace megdnn {
  18. namespace relayout {
  19. static inline bool is_contig(const TensorLayout& layout) {
  20. return layout.ndim == 1 && layout.stride[0] == 1;
  21. }
//! [b][m][n][c] to [b][n][m][c]
//! decomposition of a relayout into a batched 2D transpose:
//! batch outer batches, an m x n matrix whose two dims get swapped, and a
//! contiguous innermost channel of c elements carried along unchanged
struct TransposeParam {
    size_t batch, m, n, c;
};
/**
 * \brief whether the relayout can be formulated as TransposeParam
 *
 * Note that \p src and \p dst should have been processed by
 * RelayoutForward::check_layout_and_canonize
 *
 * \param src canonized source layout
 * \param dst canonized destination layout
 * \param[out] p filled with the decomposed (batch, m, n, c) when the
 *      function returns true; unspecified otherwise
 * \return true iff copying \p src to \p dst is a batched matrix transpose
 */
bool is_transpose(const TensorLayout& src, const TensorLayout& dst,
                  TransposeParam& p);
  34. namespace transpose_fallback {
//! byte size of one edge of the square working tile used by the blocked
//! transpose below; tuned per target architecture (roughly a cache line)
#if MEGDNN_X86 || MEGDNN_NAIVE
constexpr size_t BLOCK_LINE_SIZE_BYTES = 64;
#elif MEGDNN_AARCH64 || MEGDNN_ARMV7
constexpr size_t BLOCK_LINE_SIZE_BYTES = 32;
#elif MEGDNN_RISCV64
//! ref U54-MC arch
constexpr size_t BLOCK_LINE_SIZE_BYTES = 64;
#else
#error "unknown megdnn arch"
#endif
/**
 * \brief transpose traits
 * \tparam T element type
 *
 * block_size is the tile edge length in elements, derived from the per-arch
 * BLOCK_LINE_SIZE_BYTES; this struct (together with transpose_block) may be
 * specialized to provide an optimized block transpose for a given T
 */
template <typename T>
struct transpose_traits {
    static constexpr size_t block_size = BLOCK_LINE_SIZE_BYTES / sizeof(T);
};
  53. template <typename T>
  54. void transpose_block_fallback(const T* src, T* dst, const size_t src_stride,
  55. const size_t dst_stride, size_t block_h,
  56. size_t block_w) {
  57. constexpr size_t block_size = transpose_traits<T>::block_size;
  58. T block[block_size][block_size];
  59. for (size_t i = 0; i < block_h; ++i) {
  60. auto src_ptr = src + i * src_stride;
  61. for (size_t j = 0; j < block_w; ++j) {
  62. block[j][i] = src_ptr[j];
  63. }
  64. }
  65. for (size_t i = 0; i < block_w; ++i) {
  66. auto dst_ptr = dst + i * dst_stride;
  67. for (size_t j = 0; j < block_h; ++j) {
  68. dst_ptr[j] = block[i][j];
  69. }
  70. }
  71. }
  72. template <typename T>
  73. void transpose_block(const T* src, T* dst, const size_t src_stride,
  74. const size_t dst_stride, size_t block_h, size_t block_w) {
  75. transpose_block_fallback(src, dst, src_stride, dst_stride, block_h,
  76. block_w);
  77. }
  78. /*!
  79. * \brief transpose a single block whose size is transpose_traits<T>::block_size
  80. *
  81. * This function and transpose_traits can be specialized to implement optimized
  82. * block transpose
  83. */
  84. template <typename T>
  85. void transpose_block(const T* src, T* dst, const size_t src_stride,
  86. const size_t dst_stride) {
  87. constexpr size_t block_size = transpose_traits<T>::block_size;
  88. transpose_block_fallback(src, dst, src_stride, dst_stride, block_size,
  89. block_size);
  90. }
  91. /*!
  92. * \brief transpose contiguous (batch, m, n) to (batch, n, m)
  93. */
  94. template <typename T>
  95. void transpose(size_t batch, size_t m, size_t n, T* src, T* dst) {
  96. auto batch_src = src;
  97. auto batch_dst = dst;
  98. constexpr size_t B = transpose_traits<T>::block_size;
  99. auto work_block = [m, n, &batch_src, &batch_dst](
  100. const size_t i, const size_t j, const size_t h,
  101. const size_t w) {
  102. auto src = batch_src + i * n + j, dst = batch_dst + j * m + i;
  103. MIDOUT_BEGIN(transpose_fallback, midout_iv(0)) {
  104. if (h == B && w == B) {
  105. transpose_block(src, dst, n, m);
  106. } else {
  107. transpose_block(src, dst, n, m, h, w);
  108. }
  109. }
  110. MIDOUT_END();
  111. };
  112. auto work_row = [&work_block, n](size_t i, size_t h) {
  113. size_t j = 0;
  114. for (; j + B <= n; j += B) {
  115. work_block(i, j, h, B);
  116. }
  117. if (j < n) {
  118. work_block(i, j, h, n - j);
  119. }
  120. };
  121. for (size_t b = 0; b < batch; ++b) {
  122. size_t i = 0;
  123. for (; i + B <= m; i += B) {
  124. work_row(i, B);
  125. }
  126. if (i < m) {
  127. work_row(i, m - i);
  128. }
  129. batch_src += m * n;
  130. batch_dst += m * n;
  131. }
  132. }
  133. } // namespace transpose_fallback
  134. } // namespace relayout
  135. } // namespace megdnn
  136. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台