
opr_impl.h

/**
 * \file dnn/src/fallback/conv_bias/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once

#include "include/megdnn/thin/function.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/convolution/opr_impl.h"
#include "src/fallback/matrix_mul/opr_impl.h"
#include "src/naive/conv_bias/opr_impl.h"
namespace megdnn {
namespace fallback {

/*!
 * \brief get the pack_size according to the format
 * Note TODO: when format is removed from param, this may be specified
 * via opr::param::format
 */
size_t pack_size(param::ConvBias::Format format);
/*!
 * \brief fallback conv bias forward impl
 *
 * Note: this operator class serves multiple purposes:
 *
 * 1. canonicalizing conv representations into NCBKernParam and
 *    NCBKernSizeParam; subclasses should implement the *_ncb methods
 * 2. providing a default impl for group conv by calling ncb_1g* methods
 * 3. providing a conv impl faster than naive in some cases
 * 4. providing a default heuristic algorithm chooser, which picks the
 *    first algo that fits the workspace limit
 */
class ConvBiasImpl : public naive::ConvBiasForwardImpl {
public:
    using naive::ConvBiasForwardImpl::ConvBiasForwardImpl;
    using AlgoSelectionStrategy = detail::AlgoSelectionStrategy;

    //! implemented by exec_with_ncb_kern()
    void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
              _megdnn_tensor_in bias, _megdnn_tensor_in z,
              _megdnn_tensor_out dst, const PreprocessedFilter*,
              _megdnn_workspace workspace) override;

    //! implemented by get_workspace_with_ncb()
    size_t get_workspace_in_bytes(const TensorLayout& src,
                                  const TensorLayout& filter,
                                  const TensorLayout& bias,
                                  const TensorLayout& z,
                                  const TensorLayout& dst,
                                  const PreprocessedFilter*) override;

    //! implemented by get_all_algorithms_with_ncb()
    std::vector<Algorithm*> get_all_algorithms(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& bias, const TensorLayout& z,
            const TensorLayout& dst) override;

    //! implemented by get_algorithm_heuristic_with_ncb()
    Algorithm* get_algorithm_heuristic(const TensorLayout& src,
                                       const TensorLayout& filter,
                                       const TensorLayout& bias,
                                       const TensorLayout& z,
                                       const TensorLayout& dst,
                                       size_t workspace_limit_in_bytes,
                                       bool reproducible) override;

    //! size param for kernels with non-contiguous batch
    struct NCBKernSizeParam : ConvolutionImpl::NCBKernSizeParam {
        NCBKernSizeParam() = default;
        NCBKernSizeParam(const ConvolutionImpl::NCBKernSizeParam& param,
                         size_t output_block_size,
                         param::MatrixMul::Format winograd_matmul_format,
                         DType bias_type, ptrdiff_t bias_bs, BiasMode bias_mode,
                         Param::NonlineMode nonlineMode)
                : ConvolutionImpl::NCBKernSizeParam(param),
                  output_block_size{output_block_size},
                  winograd_matmul_format{winograd_matmul_format},
                  bias_type{bias_type},
                  bias_bs{bias_bs},
                  bias_mode{bias_mode},
                  nonlineMode{nonlineMode} {}
        size_t output_block_size;  //!< used in winograd algo
        param::MatrixMul::Format winograd_matmul_format;
        DType bias_type;
        //! stride for batch of bias
        ptrdiff_t bias_bs;
        BiasMode bias_mode;
        Param::NonlineMode nonlineMode;
    };
    //! memory param for kernels with non-contiguous batch
    struct NCBKernParam : public NCBKernSizeParam {
        NCBKernParam() = default;
        const void* src_ptr;
        const void* filter_ptr;
        const void* bias_ptr;
        void* dst_ptr;
        void* workspace_ptr;
        size_t workspace_size;

        template <typename T>
        const T* src() const {
            src_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(src_ptr);
        }

        //! when format is nchwxx, multiple channels are packed into one
        //! channel_pack_id; channel_pack_size is the number of packed
        //! channels. when format is nchwxx and channel wise, multiple groups
        //! are packed into one group_pack_id; group_pack_size is the number
        //! of groups packed together, e.g. weight shape {g/8, 1, 1, Fh, Fw, 8}
        template <typename T>
        const T* src(size_t batch_id, size_t group_pack_id,
                     size_t channel_pack_id = 0, size_t group_pack_size = 1,
                     size_t channel_pack_size = 1) const;

        template <typename T>
        const T* bias(size_t batch_id, size_t group_pack_id,
                      size_t channel_pack_id = 0, size_t group_pack_size = 1,
                      size_t channel_pack_size = 1) const;

        template <typename T>
        T* dst(size_t batch_id, size_t group_pack_id,
               size_t channel_pack_id = 0, size_t group_pack_size = 1,
               size_t channel_pack_size = 1) const;

        //! when format is nchwxx and channel wise, multiple groups are packed
        //! into one group_pack_id; group_pack_size is the number of groups
        //! packed together, e.g. weight shape {g/8, 1, 1, Fh, Fw, 8}
        template <typename T>
        const T* filter(size_t group_pack_id,
                        size_t pack_group_size = 1_z) const;

        template <typename T>
        const T* filter() const {
            filter_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(filter_ptr);
        }

        template <typename T>
        const T* bias() const {
            bias_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(bias_ptr);
        }

        template <typename T>
        T* dst() const {
            dst_type.assert_is_compatible_ctype<T>();
            return static_cast<T*>(dst_ptr);
        }

        template <typename T>
        T* workspace() const {
            return static_cast<T*>(workspace_ptr);
        }
    };
    /**
     * \brief kernel run-time index; used by a kernel invocation to locate
     * the data it should work on
     */
    struct NCBKernIndex {
        size_t thread_id = 0;  //!< thread id
        CpuNDRange ndrange_id;
    };

    //! moved from arm_common to fallback
    virtual bool is_matmul_quantized_prefer(
            const ConvBiasImpl::NCBKernSizeParam& ncb_param) {
        MEGDNN_MARK_USED_VAR(ncb_param);
        return true;
    };

    using ncb_kern_t = thin_function<void(const NCBKernParam& param,
                                          const NCBKernIndex& ncb_index)>;
    struct NCBKern {
        ncb_kern_t kern;  //!< conv kern parallel ptr
        CpuNDRange global_size;
    };
    class AlgoBase : public Algorithm {
    public:
        virtual ~AlgoBase() = default;
        virtual bool usable(
                ConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const = 0;
        virtual size_t get_workspace(ConvBiasImpl* opr,
                                     const NCBKernSizeParam& param) const = 0;
        virtual SmallVector<NCBKern> dispatch_kerns(
                ConvBiasImpl* opr, const NCBKernSizeParam& param) const = 0;

        //! temporarily used to identify whether the matmul algorithm is
        //! preferred
        virtual bool is_preferred(ConvBiasImpl*,
                                  const NCBKernSizeParam&) const {
            return false;
        }

        bool usable_reproducible(ConvBiasImpl* opr,
                                 const NCBKernSizeParam& param,
                                 AlgoSelectionStrategy algo_selection_strategy,
                                 bool reproducible = true) const {
            return (!reproducible || is_reproducible()) &&
                   usable(opr, param, algo_selection_strategy);
        }
    };
    /**
     * \brief get all the algorithms for the opr
     */
    virtual SmallVector<AlgoBase*> algo_pack();

protected:
    //! default impl calls ncb_algo_dispatch_kern()
    virtual void exec_with_ncb_kern(const NCBKernParam& param,
                                    ConvBiasImpl::Algorithm* algo);

    //! default impl calls ncb_algo_get_all_algorithms()
    virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
            const NCBKernSizeParam& param);

    //! default impl calls ncb_algo_get_algorithm_heuristic()
    virtual Algorithm* get_algorithm_heuristic_with_ncb(
            const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
            bool reproducible = false);

    /**
     * \brief get kernel pointers for the non-contiguous batch kernel or
     * the plain conv bias kernel.
     *
     * whether the kernel processes a single batch-group is decided by the
     * algo.
     */
    virtual SmallVector<NCBKern> ncb_algo_dispatch_kerns(
            Algorithm* algo, const NCBKernSizeParam& param);
    virtual size_t ncb_algo_get_workspace(Algorithm* algo,
                                          const NCBKernSizeParam& param);

    /*!
     * the default impl iterates over ncb_algo_get_all_algorithms()
     * and returns the first algo whose workspace does not exceed the limit.
     */
    virtual Algorithm* ncb_algo_get_algorithm_heuristic(
            const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
            bool reproducible = false);

    const char* get_algorithm_set_name() const override;
private:
    class AlgoNaive;
    class AlgoIm2col;
    class AlgoConv1x1;
    class AlgoWinogradF32;
    class AlgoWinogradF32_4x4;
    class AlgoWinogradQS8;
    class AlgoWinogradQS8_8x8;
    class AlgoPack;

    NCBKernSizeParam m_prev_selected_algo_sizep;
    Algorithm* m_prev_selected_algo = nullptr;

    bool is_naive_algo(ConvBiasImpl::Algorithm* algo);

    //! get algorithm set by user or by heuristic
    Algorithm* get_algorithm(
            const NCBKernSizeParam& param,
            size_t workspace_size = std::numeric_limits<size_t>::max());

    NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& src,
                                              const TensorLayout& filter,
                                              const TensorLayout& bias,
                                              const TensorLayout& dst);
    NCBKernParam make_ncb_kern_param(_megdnn_tensor_in src,
                                     _megdnn_tensor_in filter,
                                     _megdnn_tensor_in bias,
                                     _megdnn_tensor_out dst,
                                     _megdnn_workspace workspace);
};

}  // namespace fallback
}  // namespace megdnn
//! unpack NCBKernSizeParam into local variables (N, IC, IH, IW, ...)
#define UNPACK_CONV_NCB_KERN_SIZES(_p)                                       \
    auto N = _p.n, IC = _p.filter_meta.icpg, IH = _p.isz[0], IW = _p.isz[1], \
         OC = _p.filter_meta.ocpg, OH = _p.osz[0], OW = _p.osz[1],           \
         FH = _p.filter_meta.spatial[0], FW = _p.filter_meta.spatial[1],     \
         SH = _p.filter_meta.stride[0], SW = _p.filter_meta.stride[1],       \
         PH = _p.filter_meta.padding[0], PW = _p.filter_meta.padding[1]

// vim: syntax=cpp.doxygen
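
To make the NCB kernel contract above concrete, here is a minimal usage sketch (not part of the header). The kernel name example_kern, the {batch, group} ndrange convention, and the float data type are assumptions for illustration only; a real AlgoBase subclass would also implement usable() and get_workspace().

// Hypothetical usage sketch; every name except the megdnn types is assumed.
#include "src/fallback/conv_bias/opr_impl.h"

using namespace megdnn;
using namespace megdnn::fallback;

namespace {

// One work item, assumed here to be indexed as {batch, group} in ndrange_id.
void example_kern(const ConvBiasImpl::NCBKernParam& param,
                  const ConvBiasImpl::NCBKernIndex& ncb_index) {
    // Unpack N, IC, IH, IW, OC, OH, OW, FH, FW, SH, SW, PH, PW as locals;
    // most are left unused in this sketch.
    UNPACK_CONV_NCB_KERN_SIZES(param);
    MEGDNN_MARK_USED_VAR(N);
    size_t batch_id = ncb_index.ndrange_id[0];
    size_t group_id = ncb_index.ndrange_id[1];
    const float* sptr = param.src<float>(batch_id, group_id);
    float* dptr = param.dst<float>(batch_id, group_id);
    // ... compute the OC x OH x OW output block from IC x IH x IW inputs ...
    MEGDNN_MARK_USED_VAR(sptr);
    MEGDNN_MARK_USED_VAR(dptr);
}

}  // anonymous namespace

// A dispatch_kerns() override could then return one parallel kernel whose
// global_size spans all (batch, group) pairs, along the lines of:
//     SmallVector<ConvBiasImpl::NCBKern> kerns;
//     kerns.push_back({example_kern, {param.n, param.filter_meta.group}});
//     return kerns;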

The MegEngine package ships with the CUDA environment needed to run code on a GPU, so there are no separate CPU and GPU builds. To run GPU programs, make sure the machine has a GPU and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.