You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

opr_impl.h 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
/**
 * \file dnn/src/fallback/convolution/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
  11. #pragma once
  12. #include "src/common/utils.h"
  13. #include "src/fallback/handle.h"
  14. #include "src/naive/convolution/opr_impl.h"
  15. namespace megdnn {
  16. namespace fallback {
  17. /*!
  18. * \brief fallback convolution forward impl
  19. *
  20. * Note: this operator class serves for multiple purposes:
  21. *
  22. * 1. canonizing conv reprs into NCBKernParam and NCBKernSizeParam, and
  23. * subclasses should impl by overriding *_ncb methods
  24. * 2. providing a default impl for group conv by calling ncb_1g* methods
  25. * 3. providing a conv impl faster than naive under some cases
  26. * 4. providing a default impl for choosing heuristic algorithm, by using the
  27. * first algo that fits the workspace limit
  28. */
  29. class ConvolutionImpl : public naive::ConvolutionForwardImpl {
  30. public:
  31. using naive::ConvolutionForwardImpl::ConvolutionForwardImpl;
  32. using AlgoSelectionStrategy = detail::AlgoSelectionStrategy;
  33. //! implemented by exec_with_ncb_kern()
  34. void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
  35. _megdnn_tensor_out dst, const PreprocessedFilter*,
  36. _megdnn_workspace workspace) override;
  37. void exec_preprocess(const TensorLayout& src_layout,
  38. _megdnn_tensor_in filter,
  39. const TensorLayout& dst_layout,
  40. PreprocessedFilter* preprocessed_filter,
  41. _megdnn_workspace workspace) override;
  42. //! implemented by get_workspace_with_ncb()
  43. size_t get_workspace_in_bytes(const TensorLayout& src,
  44. const TensorLayout& filter,
  45. const TensorLayout& dst,
  46. const PreprocessedFilter*) override;
  47. SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
  48. const TensorLayout& src, const TensorLayout& filter,
  49. const TensorLayout& dst) override;
  50. size_t get_preprocess_workspace_in_bytes(const TensorLayout& src,
  51. const TensorLayout& filter,
  52. const TensorLayout& dst) override;
  53. //! implemented by get_all_algorithms_with_ncb()
  54. std::vector<Algorithm*> get_all_algorithms(
  55. const TensorLayout& src, const TensorLayout& filter,
  56. const TensorLayout& dst) override;
  57. //! implemented by get_algorithm_heuristic_with_ncb()
  58. Algorithm* get_algorithm_heuristic(const TensorLayout& src,
  59. const TensorLayout& filter,
  60. const TensorLayout& dst,
  61. size_t workspace_limit_in_bytes,
  62. bool reproducible) override;
  63. //! size param for kernels with non-contiguous batch
  64. struct NCBKernSizeParam {
  65. uint32_t n;
  66. std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
  67. //! filter info; group is guaranteed to be 1
  68. CanonizedFilterMeta filter_meta;
  69. DType src_type, filter_type, dst_type;
  70. //! stride for batch of input, output
  71. ptrdiff_t inp_bs, out_bs;
  72. //! stride for each dim of input, output
  73. ptrdiff_t inp_s[4], out_s[4];
  74. Param::ComputeMode compute_mode;
  75. size_t nr_threads;
  76. //! weight_preprocess info
  77. const PreprocessedFilter* preprocessed_filter;
  78. };
  79. //! memory param for kernels with non-contiguous batch
  80. struct NCBKernParam : public NCBKernSizeParam {
  81. const void* src_ptr;
  82. const void* filter_ptr;
  83. void* dst_ptr;
  84. void* workspace_ptr;
  85. size_t workspace_size;
  86. template <typename T>
  87. const T* src() const {
  88. src_type.assert_is_compatible_ctype<T>();
  89. return static_cast<const T*>(src_ptr);
  90. }
  91. template <typename T>
  92. const T* filter() const {
  93. filter_type.assert_is_compatible_ctype<T>();
  94. return static_cast<const T*>(filter_ptr);
  95. }
  96. template <typename T>
  97. T* dst() const {
  98. dst_type.assert_is_compatible_ctype<T>();
  99. return static_cast<T*>(dst_ptr);
  100. }
  101. template <typename T>
  102. T* workspace() const {
  103. return static_cast<T*>(workspace_ptr);
  104. }
  105. //! when format is nchwxx and channel wise, multi group will pack into
  106. //! one group_pack_id. group_pack_size is the number of packed group
  107. //! together, like weight shape is {g/8, 1, 1, Fh, Fw, 8}
  108. template <typename T>
  109. T* dst(size_t batch_id, size_t group_pack_id,
  110. size_t group_pack_size = 1_z) const{
  111. size_t batch_offset = batch_id * out_bs * dst_type.size();
  112. size_t group_offset = group_pack_size * group_pack_id *
  113. filter_meta.ocpg * osz[0] * osz[1] *
  114. dst_type.size();
  115. return reinterpret_cast<T*>(reinterpret_cast<ptrdiff_t>(dst_ptr) +
  116. batch_offset + group_offset);
  117. }
  118. template <typename T>
  119. const T* src(size_t batch_id, size_t group_pack_id,
  120. size_t group_pack_size = 1_z) const {
  121. size_t batch_offset = batch_id * inp_bs * src_type.size();
  122. size_t group_offset = group_pack_size * group_pack_id *
  123. filter_meta.icpg * isz[0] * isz[1] *
  124. src_type.size();
  125. return reinterpret_cast<T*>(reinterpret_cast<ptrdiff_t>(src_ptr) +
  126. batch_offset + group_offset);
  127. }
  128. template <typename T>
  129. const T* filter(size_t group_pack_id,
  130. size_t pack_group_size = 1_z) const {
  131. size_t group_offset = pack_group_size * group_pack_id *
  132. filter_meta.icpg * filter_meta.ocpg *
  133. filter_meta.spatial[0] *
  134. filter_meta.spatial[1] * filter_type.size();
  135. return reinterpret_cast<T*>(
  136. reinterpret_cast<ptrdiff_t>(filter_ptr) + group_offset);
  137. }
  138. };
  139. static void* const sm_fallback_conv_algo_type;
  140. /**
  141. * \brief Kernel run time id, This information is used for getting the
  142. * work data
  143. */
  144. struct NCBKernIndex {
  145. size_t thread_id = 0; //!< Thread id
  146. CpuNDRange ndrange_id;
  147. };
  148. using ncb_kern_t = thin_function<void(const NCBKernParam& param,
  149. const NCBKernIndex& ncb_index)>;
  150. struct NCBKern {
  151. ncb_kern_t kern; //!< conv kern parallel ptr
  152. CpuNDRange global_size;
  153. };
  154. class AlgoBase : public Algorithm {
  155. public:
  156. virtual ~AlgoBase() = default;
  157. virtual bool usable(const NCBKernSizeParam& param,
  158. AlgoSelectionStrategy) const = 0;
  159. virtual size_t get_workspace(const NCBKernSizeParam& param) const = 0;
  160. virtual SmallVector<NCBKern> dispatch_kern(
  161. const NCBKernSizeParam& param) const = 0;
  162. virtual SmallVector<NCBKern> dispatch_preprocess_kern(
  163. const NCBKernSizeParam&) const {
  164. return {};
  165. };
  166. //! get the layouts of weight_prerocess dst
  167. virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
  168. const NCBKernSizeParam&) const {
  169. return {};
  170. };
  171. //! get the workspace when weight_prerocess
  172. virtual size_t get_preprocess_workspace(const NCBKernSizeParam&) const {
  173. return 0_z;
  174. };
  175. //! Temporarily used to identify whether the matmul algorithm is
  176. //! is_preferred.
  177. virtual bool is_preferred(const NCBKernSizeParam&) const {
  178. return false;
  179. }
  180. bool usable_reproducible(const NCBKernSizeParam& param,
  181. AlgoSelectionStrategy algo_selection_strategy,
  182. bool reproducible = true) const {
  183. return (!reproducible || is_reproducible()) &&
  184. usable(param, algo_selection_strategy);
  185. }
  186. };
  187. /**
  188. * \brief get all the algorithm for the opr.
  189. */
  190. virtual SmallVector<AlgoBase*> algo_pack();
  191. protected:
  192. virtual void exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo);
  193. virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param,
  194. Algorithm* algo);
  195. virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
  196. const NCBKernSizeParam& param);
  197. virtual Algorithm* get_algorithm_heuristic_with_ncb(
  198. const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
  199. bool reproducible = false);
  200. const char* get_algorithm_set_name() const override;
  201. class AlgoFallback;
  202. class AlgoNaive;
  203. class AlgoDefault;
  204. class AlgoPack;
  205. private:
  206. NCBKernSizeParam m_prev_selected_algo_sizep;
  207. Algorithm* m_prev_selected_algo = nullptr;
  208. bool is_naive_algo(ConvolutionImpl::Algorithm* algo);
  209. //! get algorithm set by user or by heuristic
  210. Algorithm* get_algorithm(
  211. const NCBKernSizeParam& param,
  212. size_t workspace_size = std::numeric_limits<size_t>::max());
  213. NCBKernSizeParam make_ncb_kern_size_param(
  214. const TensorLayout& src, const TensorLayout& filter,
  215. const TensorLayout& dst,
  216. const PreprocessedFilter* preprocessed_filter);
  217. NCBKernParam make_ncb_kern_param(
  218. _megdnn_tensor_in src, _megdnn_tensor_in filter,
  219. _megdnn_tensor_out dst,
  220. const PreprocessedFilter* preprocessed_filter,
  221. _megdnn_workspace workspace);
  222. };
  223. class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl {
  224. public:
  225. using naive::ConvolutionBackwardDataImpl::ConvolutionBackwardDataImpl;
  226. void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff,
  227. _megdnn_tensor_out grad, _megdnn_workspace workspace) override;
  228. size_t get_workspace_in_bytes(const TensorLayout& flter,
  229. const TensorLayout& diff,
  230. const TensorLayout& grad) override;
  231. std::vector<Algorithm*> get_all_algorithms(
  232. const TensorLayout& filter, const TensorLayout& diff,
  233. const TensorLayout& grad) override;
  234. Algorithm* get_algorithm_heuristic(const TensorLayout& filter,
  235. const TensorLayout& diff,
  236. const TensorLayout& grad,
  237. size_t workspace_limit_in_bytes,
  238. bool reproducible) override;
  239. const char* get_algorithm_set_name() const override;
  240. //! size param for kernels with non-contiguous batch
  241. struct NCBKernSizeParam {
  242. uint32_t n;
  243. std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
  244. //! filter info; group is guaranteed to be 1
  245. CanonizedFilterMeta filter_meta;
  246. DType diff_type, filter_type, grad_type;
  247. TensorLayout diff_layout, filter_layout, grad_layout;
  248. //! stride for batch of input, output
  249. ptrdiff_t inp_bs, out_bs;
  250. //! extra_mem_size (in bytes) memory after the end of the logical
  251. //! memory block is accessible.
  252. //!
  253. //! this allows for eliminating unnecessary memory copies: e.g.
  254. //! if several bytes after the end of the tensor are
  255. //! accessible, some kernel implementations can utilize
  256. //! out-of-bound SIMD memory access, to avoid issuing
  257. //! memcpy instructions.
  258. //!
  259. //! Note that although extra_mem_size bytes are accessible by the
  260. //! kernel implementation, kernel implementation should not have any
  261. //! ``visible'' effect on any unintended memory location.
  262. //! This means reading and writing the same value to some memory
  263. //! location within extra_mem_size is allowed, but writing a
  264. //! different value is not allowed.
  265. size_t diff_extra_mem_size, filter_extra_mem_size, grad_extra_mem_size;
  266. Param::ComputeMode compute_mode;
  267. };
  268. //! memory param for kernels with non-contiguous batch
  269. struct NCBKernParam : public NCBKernSizeParam {
  270. const void* filter_ptr;
  271. const void* diff_ptr;
  272. void* grad_ptr;
  273. void* workspace_ptr;
  274. size_t workspace_size;
  275. template <typename T>
  276. const T* diff() const {
  277. diff_type.assert_is_compatible_ctype<T>();
  278. return static_cast<const T*>(diff_ptr);
  279. }
  280. template <typename T>
  281. const T* filter() const {
  282. filter_type.assert_is_compatible_ctype<T>();
  283. return static_cast<const T*>(filter_ptr);
  284. }
  285. template <typename T>
  286. T* grad() const {
  287. grad_type.assert_is_compatible_ctype<T>();
  288. return static_cast<T*>(grad_ptr);
  289. }
  290. template <typename T>
  291. T* workspace() const {
  292. return static_cast<T*>(workspace_ptr);
  293. }
  294. };
  295. protected:
  296. typedef void (*ncb_kern_t)(const NCBKernParam& param);
  297. //! default impl calls ncb_1g_dispatch_kern()
  298. virtual void exec_with_ncb_kern(const NCBKernParam& param);
  299. //! default impl calls ncb_1g_get_workspace()
  300. virtual size_t get_workspace_with_ncb(const NCBKernSizeParam& param);
  301. //! default impl calls ncb_1g_get_all_algorithms()
  302. virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
  303. const NCBKernSizeParam& param);
  304. //! default impl calls ncb_1g_get_algorithm_heuristic()
  305. virtual Algorithm* get_algorithm_heuristic_with_ncb(
  306. const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
  307. bool reproducible = false);
  308. //! get kernel pointer for float32 non-contiguous batch 1-group kernel
  309. virtual ncb_kern_t ncb_1g_dispatch_kern(Algorithm* algo,
  310. const NCBKernSizeParam& param);
  311. virtual size_t ncb_1g_get_workspace(Algorithm* algo,
  312. const NCBKernSizeParam& param);
  313. virtual std::vector<Algorithm*> ncb_1g_get_all_algorithms(
  314. const NCBKernSizeParam& param);
  315. /*!
  316. * the default impl iterates over all ncb_1g_get_all_algorithms()
  317. * and return the first one whose workspace does not exceed the limit.
  318. */
  319. virtual Algorithm* ncb_1g_get_algorithm_heuristic(
  320. const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
  321. bool reproducible = false);
  322. static void* const sm_fallback_deconv_algo_type;
  323. class AlgoBase : public Algorithm {
  324. protected:
  325. ~AlgoBase() = default;
  326. public:
  327. virtual bool usable(ConvolutionBackwardDataImpl* opr,
  328. const NCBKernSizeParam& param) const = 0;
  329. virtual size_t get_workspace(ConvolutionBackwardDataImpl* opr,
  330. const NCBKernSizeParam& param) const = 0;
  331. virtual ncb_kern_t dispatch_kern(
  332. ConvolutionBackwardDataImpl* opr,
  333. const NCBKernSizeParam& param) const = 0;
  334. bool usable_reproducible(ConvolutionBackwardDataImpl* opr,
  335. const NCBKernSizeParam& param,
  336. bool reproducible = true) const {
  337. return (!reproducible || is_reproducible()) && usable(opr, param);
  338. }
  339. };
  340. static bool is_matrix_mul_preferred(const NCBKernSizeParam& param);
  341. private:
  342. NCBKernSizeParam m_prev_selected_algo_sizep;
  343. Algorithm* m_prev_selected_algo = nullptr;
  344. //! get algorithm set by user or by heuristic
  345. Algorithm* get_algorithm(const NCBKernSizeParam& param);
  346. NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& filter,
  347. const TensorLayout& diff,
  348. const TensorLayout& grad);
  349. NCBKernParam make_ncb_kern_param(_megdnn_tensor_in filter,
  350. _megdnn_tensor_in diff,
  351. _megdnn_tensor_out grad,
  352. _megdnn_workspace workspace);
  353. class AlgoDirect;
  354. class AlgoMatrixMul;
  355. struct AlgoPack;
  356. static AlgoPack sm_algo_pack;
  357. };
  358. } // namespace fallback
  359. } // namespace megdnn
  360. //! unpack NCBKernSizeParam into local variables (N, IC, IH, IW, ...)
  361. #define UNPACK_CONV_F32_NCB_KERN_SIZES(_p) \
  362. auto N = _p.n, IC = _p.filter_meta.icpg, IH = _p.isz[0], IW = _p.isz[1], \
  363. OC = _p.filter_meta.ocpg, OH = _p.osz[0], OW = _p.osz[1], \
  364. FH = _p.filter_meta.spatial[0], FW = _p.filter_meta.spatial[1], \
  365. SH = _p.filter_meta.stride[0], SW = _p.filter_meta.stride[1], \
  366. PH = _p.filter_meta.padding[0], PW = _p.filter_meta.padding[1]
  367. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台