/**
 * \file dnn/src/cuda/relayout/param_visitor.cuh
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 */
#include "megdnn/basic_types.h"
#include "src/cuda/int_fastdiv.cuh"
#include "src/cuda/integer_subbyte_utils.cuh"
#include "src/cuda/utils.cuh"

#pragma once

namespace megdnn {
namespace cuda {

#define devfunc __device__ __forceinline__

/*!
 * \brief contiguous type
 * If the layout is fully contiguous, the type is CONTIG_FULL; otherwise it
 * is CONTIG_OTHER.
 */
enum ContigType { CONTIG_OTHER, CONTIG_FULL };

/* f{{{ ParamElemVisitor specialization */

/*!
 * \brief visitor to access an element in a tensor at a given logical index
 * \tparam ctype plain element ctype (i.e. ctype in DTypeTrait)
 * \tparam contig_mask bit mask for contig of params;
 *
 * host interface:
 *      void host_init(
 *              const TensorND &tensor, int grid_size, int block_size)
 *
 * device interface:
 *      void thread_init(uint32_t idx)
 *          called on thread entrance, with logical indexing; the index may
 *          go beyond buffer range
 *
 *      ctype* ptr()
 *          return buffer pointer; can be used by specialized OpCaller
 *
 *      int offset(uint32_t idx)
 *          get physical offset from logical index
 *
 *      ctype& at(uint32_t idx)
 *          ptr()[offset(idx)]
 */
template <int ndim, typename ctype, ContigType contig_type>
class ParamElemVisitor;

#define PARAM_ELEM_VISITOR_COMMON_DEV                              \
    devfunc ctype* ptr() { return m_ptr; }                         \
    devfunc ctype& at(uint32_t idx) { return m_ptr[offset(idx)]; }

//! specialization for CONTIG_OTHER
template <int ndim, typename ctype>
class ParamElemVisitor<ndim, ctype, CONTIG_OTHER> {
    ctype* __restrict m_ptr;
    int m_stride[ndim];

    //! m_shape_highdim[i] = original_shape[i + 1]
#ifdef _MSC_VER
    Uint32Fastdiv m_shape_highdim[ndim > 1 ? ndim - 1 : 1];
#else
    Uint32Fastdiv m_shape_highdim[ndim - 1];
#endif

public:
    static const int NDIM = ndim;

    void host_init(const TensorND& rv, int grid_size, int block_size);

#if MEGDNN_CC_CUDA
    devfunc void thread_init(uint32_t) {}

    devfunc void next() {}

    devfunc int offset(uint32_t idx) {
        int offset = 0;
#pragma unroll
        for (int i = ndim - 1; i >= 1; --i) {
            Uint32Fastdiv& shp = m_shape_highdim[i - 1];
            uint32_t idx_div = idx / shp;
            offset += (idx - idx_div * shp.divisor()) * m_stride[i];
            idx = idx_div;
        }
        offset += idx * m_stride[0];
        return offset;
    }

    PARAM_ELEM_VISITOR_COMMON_DEV
#endif
};
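/*
 * Worked example for offset() (illustrative; the numbers below are not from
 * the original file): for a tensor of logical shape (2, 3, 4), host_init()
 * stores m_shape_highdim = {3, 4} as Uint32Fastdiv divisors. For logical
 * index idx = 17 the loop peels coordinates from the innermost dimension
 * outwards:
 *     i = 2: 17 / 4 = 4, remainder 1  ->  offset += 1 * m_stride[2]
 *     i = 1:  4 / 3 = 1, remainder 1  ->  offset += 1 * m_stride[1]
 * and finally offset += 1 * m_stride[0]; i.e. logical index 17 addresses the
 * element at coordinate (1, 1, 1), whatever the physical strides are.
 * Uint32Fastdiv replaces the hardware integer division with a precomputed
 * multiply-and-shift, which is why the shape divisors are set up once on the
 * host in host_init() rather than per thread.
 */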
//! specialization for CONTIG_FULL
template <int ndim, typename ctype>
class ParamElemVisitor<ndim, ctype, CONTIG_FULL> {
    ctype* __restrict m_ptr;

public:
    static const int NDIM = ndim;

    void host_init(const TensorND& rv, int grid_size, int block_size);

#if MEGDNN_CC_CUDA
    devfunc void thread_init(uint32_t) {}

    devfunc void next() {}

    devfunc int offset(uint32_t idx) { return idx; }

    PARAM_ELEM_VISITOR_COMMON_DEV
#endif
};
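/*!
 * Usage sketch (an illustration, not part of the original header): a relayout
 * kernel typically instantiates one visitor per operand, calls host_init()
 * on the host before launch, then indexes elements on the device through
 * at() with the logical element index. The kernel below is hypothetical:
 *
 * \code
 * template <int ndim, typename ctype>
 * __global__ void copy_kern(
 *         ParamElemVisitor<ndim, ctype, CONTIG_FULL> dst,
 *         ParamElemVisitor<ndim, ctype, CONTIG_OTHER> src,
 *         uint32_t nr_elems) {
 *     uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
 *     if (i < nr_elems) {
 *         // dst is fully contiguous, so its offset(i) is just i; src goes
 *         // through the stride/fastdiv computation in its offset()
 *         dst.at(i) = src.at(i);
 *     }
 * }
 * \endcode
 */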
#undef PARAM_ELEM_VISITOR_COMMON_DEV

template <int ndim>
class ParamElemVisitor<ndim, dt_quint4, CONTIG_OTHER> {
    using Storage = uint8_t;

    Storage* __restrict m_ptr;
    int m_stride[ndim];
    int m_shape[ndim];
    bool m_is_contiguous;
    bool m_is_physical_contiguous;
    bool m_is_min_stride_2;

    //! m_shape_highdim[i] = original_shape[i + 1]
#ifdef _MSC_VER
    Uint32Fastdiv m_shape_highdim[ndim > 1 ? ndim - 1 : 1];
    Uint32Fastdiv m_align_shape_highdim[ndim > 1 ? ndim - 1 : 1];
#else
    Uint32Fastdiv m_shape_highdim[ndim];
    Uint32Fastdiv m_align_shape_highdim[ndim];
#endif

public:
    static const Storage kMask = 0xf;
    static const Storage kBits = 4;
    static const int NDIM = ndim;

    void host_init(const TensorND& rv, int grid_size, int block_size);

#if MEGDNN_CC_CUDA
    devfunc void thread_init(uint32_t) {}

    devfunc void next() {}

    devfunc void get_shape_from_access(uint32_t access_idx,
                                       int (&shape_idx)[ndim]) {
#pragma unroll
        for (int i = ndim - 1; i >= 1; --i) {
            Uint32Fastdiv& align_shp = m_align_shape_highdim[i - 1];
            uint32_t access_idx_div = access_idx / align_shp;
            shape_idx[i] = access_idx - access_idx_div * align_shp.divisor();
            access_idx = access_idx_div;
        }
        shape_idx[0] = access_idx;
    }

    devfunc int offset(uint32_t idx) {
        int offset = 0;
#pragma unroll
        for (int i = ndim - 1; i >= 1; --i) {
            Uint32Fastdiv& shp = m_shape_highdim[i - 1];
            uint32_t idx_div = idx / shp;
            offset += (idx - idx_div * shp.divisor()) * m_stride[i];
            idx = idx_div;
        }
        offset += idx * m_stride[0];
        return offset;
    }

    devfunc int offset_from_access(uint32_t access_idx) {
        int offset = 0;
        if (m_is_contiguous) {
            offset = access_idx;
        } else {
            int shape_idx[ndim];
            get_shape_from_access(access_idx, shape_idx);
#pragma unroll
            for (int i = ndim - 1; i >= 0; --i) {
                offset += shape_idx[i] * m_stride[i];
            }
        }
        return offset;
    }

    devfunc int idx(uint32_t access_idx) {
        int idx = 0;
        if (m_is_physical_contiguous) {
            idx = access_idx;
        } else if (!m_is_min_stride_2) {
            int shape_idx[ndim];
            bool valid = true;
            get_shape_from_access(access_idx, shape_idx);
#pragma unroll
            for (int i = 0; i < ndim; ++i) {
                valid &= (shape_idx[i] < m_shape[i]);
            }
            for (int i = 0; i < ndim - 1; ++i) {
                idx = (idx + shape_idx[i]) * m_shape[i + 1];
            }
            idx = valid ? idx + shape_idx[ndim - 1] : -1;
        } else {  // min_stride == 2
            idx = ((access_idx & 0x1) == 0) ? ((int)access_idx >> 1) : -1;
        }
        return idx;
    }

    devfunc Storage* ptr() { return m_ptr; }

    devfunc Storage at(uint32_t idx) {
        int offset_ = offset(idx);
        int vec_idx = offset_ >> 1;
        int lane_idx = offset_ & 0x1;

        Storage item = Storage(integer_subbyte::unpack_integer_4bits<false>(
                *(Storage*)&m_ptr[vec_idx], lane_idx * 4));
        return item;
    }

    using rwtype = typename elemwise_intl::VectTypeTrait<dt_quint4>::vect_type;

    devfunc rwtype make_vector(Storage x, Storage y) {
        return elemwise_intl::VectTypeTrait<dt_quint4>::make_vector(x, y);
    }
#endif
};
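/*
 * Note on the dt_quint4 specialization (an explanatory addition): two 4-bit
 * values are packed per byte, so a physical offset k lives in byte k >> 1,
 * nibble k & 0x1. at() therefore returns the unpacked value rather than a
 * reference, and make_vector() is provided so callers can re-pack a pair of
 * nibbles when writing results back. For example, at(idx) with offset 5
 * reads byte m_ptr[2] and extracts bits 4..7.
 */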
}  // namespace cuda
}  // namespace megdnn

// vim: ft=cpp syntax=cpp.doxygen
