You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

opr_impl.h 21 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. /**
  2. * \file dnn/src/fallback/convolution/opr_impl.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #pragma once
  12. #include <memory>
  13. #include <unordered_map>
  14. #include "megdnn/oprs/base.h"
  15. #include "src/common/utils.h"
  16. #include "src/common/algo_base.h"
  17. #include "src/fallback/handle.h"
  18. #include "src/naive/convolution/opr_impl.h"
  19. namespace megdnn {
/**
 * \brief Convolution algo category
 *
 * Classifies convolution algorithms so that a preferred category order can
 * be suggested for a given problem (see suggest_algo_category_order()).
 */
enum class AlgoCategory : int32_t {
    DIRECT = 0,    //!< direct convolution
    IM2COL = 1,    //!< im2col + matmul based
    WINOGRAD = 2,  //!< Winograd transform based
    NAIVE = 3,     //!< reference/naive implementation
};
//! packed type descriptor of a conv algorithm: the data types it handles
//! and the category it belongs to; used as the key for select_algo_type()
struct ConvAlgoTypePack {
    detail::AlgoDataType data_type : 32;
    AlgoCategory algo_category : 32;
};
  33. namespace fallback {
/*!
 * \brief fallback convolution forward impl
 *
 * Note: this operator class serves for multiple purposes:
 *
 * 1. canonizing conv reprs into NCBKernParam and NCBKernSizeParam, and
 *    subclasses should impl by overriding *_ncb methods
 * 2. providing a default impl for group conv by calling ncb_1g* methods
 * 3. providing a conv impl faster than naive under some cases
 * 4. providing a default impl for choosing heuristic algorithm, by using the
 *    first algo that fits the workspace limit
 */
class ConvolutionImpl : public naive::ConvolutionForwardImpl {
public:
    using naive::ConvolutionForwardImpl::ConvolutionForwardImpl;
    using AlgoSelectionStrategy = detail::AlgoSelectionStrategy;
    using AlgoDataType = detail::AlgoDataType;

    //! implemented by exec_with_ncb_kern()
    void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
              _megdnn_tensor_out dst, const PreprocessedFilter*,
              _megdnn_workspace workspace) override;

    //! implemented by exec_preprocess_with_ncb_kern(); fills
    //! \p preprocessed_filter with the algo's transformed weights
    void exec_preprocess(const TensorLayout& src_layout,
                         _megdnn_tensor_in filter,
                         const TensorLayout& dst_layout,
                         PreprocessedFilter* preprocessed_filter,
                         _megdnn_workspace workspace) override;

    //! implemented by get_workspace_with_ncb()
    size_t get_workspace_in_bytes(const TensorLayout& src,
                                  const TensorLayout& filter,
                                  const TensorLayout& dst,
                                  const PreprocessedFilter*) override;

    //! layouts of the weight-preprocess output tensors (empty when the
    //! selected algo does not preprocess weights)
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst) override;

    //! workspace needed while running the weight-preprocess kernels
    size_t get_preprocess_workspace_in_bytes(const TensorLayout& src,
                                             const TensorLayout& filter,
                                             const TensorLayout& dst) override;

    //! implemented by get_all_algorithms_with_ncb()
    std::vector<Algorithm*> get_all_algorithms(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst) override;
    std::vector<Algorithm*> get_all_algorithms_safe(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst) override;

    //! implemented by get_algorithm_heuristic_with_ncb()
    Algorithm* get_algorithm_heuristic(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst, size_t workspace_limit_in_bytes,
            const AlgoAttribute& positive_attr,
            const AlgoAttribute& negative_attr) override;

    //! size param for kernels with non-contiguous batch
    struct NCBKernSizeParam {
        uint32_t n;  //!< batch size
        //! input / output spatial sizes
        std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
        //! filter info; group is guaranteed to be 1
        CanonizedFilterMeta filter_meta;
        DType src_type, filter_type, dst_type;
        //! stride for batch of input, output
        ptrdiff_t inp_bs, out_bs;
        //! stride for each dim of input, output
        ptrdiff_t inp_s[4], out_s[4];
        Param::ComputeMode compute_mode;
        size_t nr_threads;
        //! weight_preprocess info
        const PreprocessedFilter* preprocessed_filter;
        //! get the data type category of the param for select the algo
        AlgoDataType deduce_algo_data_type() const;
    };

    //! memory param for kernels with non-contiguous batch
    struct NCBKernParam : public NCBKernSizeParam {
        const void* src_ptr;
        const void* filter_ptr;
        void* dst_ptr;
        void* workspace_ptr;
        size_t workspace_size;

        //! typed accessors; each asserts the ctype is compatible with the
        //! corresponding DType before casting
        template <typename T>
        const T* src() const {
            src_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(src_ptr);
        }
        template <typename T>
        const T* filter() const {
            filter_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(filter_ptr);
        }
        template <typename T>
        T* dst() const {
            dst_type.assert_is_compatible_ctype<T>();
            return static_cast<T*>(dst_ptr);
        }
        template <typename T>
        T* workspace() const {
            return static_cast<T*>(workspace_ptr);
        }

        //! when format is nchwxx and channel wise, multi group will pack into
        //! one group_pack_id. group_pack_size is the number of packed group
        //! together, like weight shape is {g/8, 1, 1, Fh, Fw, 8}
        //!
        //! the offsets below are computed in bytes (element count times
        //! DType::size()), then applied via integer pointer arithmetic
        template <typename T>
        T* dst(size_t batch_id, size_t group_pack_id,
               size_t group_pack_size = 1_z) const {
            size_t batch_offset = batch_id * out_bs * dst_type.size();
            size_t group_offset = group_pack_size * group_pack_id *
                                  filter_meta.ocpg * osz[0] * osz[1] *
                                  dst_type.size();
            return reinterpret_cast<T*>(reinterpret_cast<ptrdiff_t>(dst_ptr) +
                                        batch_offset + group_offset);
        }

        template <typename T>
        const T* src(size_t batch_id, size_t group_pack_id,
                     size_t group_pack_size = 1_z) const {
            size_t batch_offset = batch_id * inp_bs * src_type.size();
            size_t group_offset = group_pack_size * group_pack_id *
                                  filter_meta.icpg * isz[0] * isz[1] *
                                  src_type.size();
            return reinterpret_cast<T*>(reinterpret_cast<ptrdiff_t>(src_ptr) +
                                        batch_offset + group_offset);
        }

        template <typename T>
        const T* filter(size_t group_pack_id,
                        size_t pack_group_size = 1_z) const {
            size_t group_offset = pack_group_size * group_pack_id *
                                  filter_meta.icpg * filter_meta.ocpg *
                                  filter_meta.spatial[0] *
                                  filter_meta.spatial[1] * filter_type.size();
            return reinterpret_cast<T*>(
                    reinterpret_cast<ptrdiff_t>(filter_ptr) + group_offset);
        }
    };

    /**
     * \brief kernel run-time index; used by a kernel to locate the slice of
     * work it should process
     */
    struct NCBKernIndex {
        size_t thread_id = 0;  //!< Thread id
        CpuNDRange ndrange_id;
    };

    //! kernel entry: invoked once per NCBKernIndex within global_size
    using ncb_kern_t = thin_function<void(const NCBKernParam& param,
                                          const NCBKernIndex& ncb_index)>;
    struct NCBKern {
        ncb_kern_t kern;  //!< conv kern parallel ptr
        CpuNDRange global_size;
    };

    //! base class of all concrete fallback conv forward algorithms
    class AlgoBase : public Algorithm {
    public:
        AlgoBase() : Algorithm() {
            m_handle_type = Handle::HandleType::FALLBACK;
        }
        enum class AlgoType : uint32_t {
            //! fallback
            FB_ALGO = 1 << 0,
            FB_NAIVE,
            FB_DEFAULT,
        };

        virtual ~AlgoBase() = default;
        virtual bool usable(const NCBKernSizeParam& param,
                            AlgoSelectionStrategy) const = 0;
        virtual size_t get_workspace(const NCBKernSizeParam& param) const = 0;

        virtual SmallVector<NCBKern> dispatch_kern(
                const NCBKernSizeParam& param) const = 0;

        //! kernels that transform the weights ahead of exec; default: none
        virtual SmallVector<NCBKern> dispatch_preprocess_kern(
                const NCBKernSizeParam&) const {
            return {};
        };

        //! get the layouts of weight_preprocess dst
        virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
                const NCBKernSizeParam&) const {
            return {};
        };

        //! get the workspace when weight_preprocess
        virtual size_t get_preprocess_workspace(const NCBKernSizeParam&) const {
            return 0_z;
        };

        //! Temporarily used to identify whether the matmul algorithm is
        //! preferred.
        virtual bool is_preferred(const NCBKernSizeParam&) const {
            return false;
        }

        //! usable() combined with the attribute filter: the algo must carry
        //! all of \p positive_attr and none of \p negative_attr
        bool usable_attribute(const NCBKernSizeParam& param,
                              AlgoSelectionStrategy algo_selection_strategy,
                              const AlgoAttribute& positive_attr =
                                      AlgoAttribute::REPRODUCIBLE,
                              const AlgoAttribute& negative_attr =
                                      AlgoAttribute::DEFAULT) const {
            return contain_attribute_all(positive_attr) &&
                   !contain_attribute_any(negative_attr) &&
                   usable(param, algo_selection_strategy);
        }

        //! get the type of the algo
        virtual ConvAlgoTypePack get_algo_type() const = 0;
        using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
    };

    /**
     * \brief get all the algorithm for the opr.
     */
    virtual SmallVector<AlgoBase*> get_all_packed_algo();

    /**
     * \brief select algo according to input algo type
     */
    SmallVector<AlgoBase*> select_algo_type(ConvAlgoTypePack algo_type);

protected:
    virtual void exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo);

    virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param,
                                               Algorithm* algo);

    virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
            const NCBKernSizeParam& param);

    virtual Algorithm* get_algorithm_heuristic_with_ncb(
            const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
            const AlgoAttribute& positive_attr,
            const AlgoAttribute& negative_attr);

    const char* get_algorithm_set_name() const override;

    class AlgoFallback;
    class AlgoNaive;
    class AlgoDefault;
    class AlgoPack;

private:
    //! cache of the last algorithm selection, keyed by the size param
    NCBKernSizeParam m_prev_selected_algo_sizep;
    Algorithm* m_prev_selected_algo = nullptr;

    Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;

    bool is_naive_algo(ConvolutionImpl::Algorithm* algo);

    //! get algorithm set by user or by heuristic
    Algorithm* get_algorithm(
            const NCBKernSizeParam& param,
            size_t workspace_size = std::numeric_limits<size_t>::max());

    NCBKernSizeParam make_ncb_kern_size_param(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst,
            const PreprocessedFilter* preprocessed_filter);

    NCBKernParam make_ncb_kern_param(
            _megdnn_tensor_in src, _megdnn_tensor_in filter,
            _megdnn_tensor_out dst,
            const PreprocessedFilter* preprocessed_filter,
            _megdnn_workspace workspace);

    //! ordered list of AlgoCategory to try for the given problem size
    SmallVector<AlgoCategory> suggest_algo_category_order(
            const NCBKernSizeParam& param) const;

public:
    //! maintain all the algos of in the opr of fallback
    static const AlgoPack& algo_pack();
};
  270. class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl {
  271. public:
  272. using naive::ConvolutionBackwardDataImpl::ConvolutionBackwardDataImpl;
  273. void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff,
  274. _megdnn_tensor_out grad, _megdnn_workspace workspace) override;
  275. size_t get_workspace_in_bytes(const TensorLayout& flter,
  276. const TensorLayout& diff,
  277. const TensorLayout& grad) override;
  278. std::vector<Algorithm*> get_all_algorithms(
  279. const TensorLayout& filter, const TensorLayout& diff,
  280. const TensorLayout& grad) override;
  281. std::vector<Algorithm*> get_all_algorithms_safe(
  282. const TensorLayout& filter, const TensorLayout& diff,
  283. const TensorLayout& grad) override;
  284. Algorithm* get_algorithm_heuristic(
  285. const TensorLayout& filter, const TensorLayout& diff,
  286. const TensorLayout& grad, size_t workspace_limit_in_bytes,
  287. const AlgoAttribute& positive_attr,
  288. const AlgoAttribute& negative_attr) override;
  289. const char* get_algorithm_set_name() const override;
  290. //! size param for kernels with non-contiguous batch
  291. struct NCBKernSizeParam {
  292. uint32_t n;
  293. std::array<uint32_t, MAX_SPATIAL_DIM> isz, osz;
  294. //! filter info; group is guaranteed to be 1
  295. CanonizedFilterMeta filter_meta;
  296. DType diff_type, filter_type, grad_type;
  297. TensorLayout diff_layout, filter_layout, grad_layout;
  298. //! stride for batch of input, output
  299. ptrdiff_t inp_bs, out_bs;
  300. //! extra_mem_size (in bytes) memory after the end of the logical
  301. //! memory block is accessible.
  302. //!
  303. //! this allows for eliminating unnecessary memory copies: e.g.
  304. //! if several bytes after the end of the tensor are
  305. //! accessible, some kernel implementations can utilize
  306. //! out-of-bound SIMD memory access, to avoid issuing
  307. //! memcpy instructions.
  308. //!
  309. //! Note that although extra_mem_size bytes are accessible by the
  310. //! kernel implementation, kernel implementation should not have any
  311. //! ``visible'' effect on any unintended memory location.
  312. //! This means reading and writing the same value to some memory
  313. //! location within extra_mem_size is allowed, but writing a
  314. //! different value is not allowed.
  315. size_t diff_extra_mem_size, filter_extra_mem_size, grad_extra_mem_size;
  316. Param::ComputeMode compute_mode;
  317. };
  318. //! memory param for kernels with non-contiguous batch
  319. struct NCBKernParam : public NCBKernSizeParam {
  320. const void* filter_ptr;
  321. const void* diff_ptr;
  322. void* grad_ptr;
  323. void* workspace_ptr;
  324. size_t workspace_size;
  325. template <typename T>
  326. const T* diff() const {
  327. diff_type.assert_is_compatible_ctype<T>();
  328. return static_cast<const T*>(diff_ptr);
  329. }
  330. template <typename T>
  331. const T* filter() const {
  332. filter_type.assert_is_compatible_ctype<T>();
  333. return static_cast<const T*>(filter_ptr);
  334. }
  335. template <typename T>
  336. T* grad() const {
  337. grad_type.assert_is_compatible_ctype<T>();
  338. return static_cast<T*>(grad_ptr);
  339. }
  340. template <typename T>
  341. T* workspace() const {
  342. return static_cast<T*>(workspace_ptr);
  343. }
  344. };
  345. protected:
  346. using ncb_kern_t = thin_function<void(const NCBKernParam& param)>;
  347. class AlgoBase : public Algorithm {
  348. protected:
  349. ~AlgoBase() = default;
  350. public:
  351. AlgoBase() : Algorithm() {
  352. m_handle_type = Handle::HandleType::FALLBACK;
  353. }
  354. enum class AlgoType : uint32_t {
  355. //! fallback
  356. FB_NAIVE = 1 << 0,
  357. FB_DIRECT,
  358. FB_MATMUL,
  359. #if MEGDNN_AARCH64 || MEGDNN_ARMV7
  360. ARM_COMMON_DIRECT_STRD1_DOT_INT8X8X32 = 1 << 8,
  361. ARM_COMMON_DIRECT_STRD2_DOT_INT8X8X32,
  362. ARM_COMMON_DIRECT_STRD1_DOT_QU8,
  363. ARM_COMMON_DIRECT_STRD2_DOT_QU8
  364. #endif
  365. };
  366. virtual bool usable(ConvolutionBackwardDataImpl* opr,
  367. const NCBKernSizeParam& param) const = 0;
  368. virtual size_t get_workspace(ConvolutionBackwardDataImpl* opr,
  369. const NCBKernSizeParam& param) const = 0;
  370. virtual ncb_kern_t dispatch_kern(
  371. ConvolutionBackwardDataImpl* opr,
  372. const NCBKernSizeParam& param) const = 0;
  373. bool usable_attribute(ConvolutionBackwardDataImpl* opr,
  374. const NCBKernSizeParam& param,
  375. const AlgoAttribute& positive_attr =
  376. AlgoAttribute::REPRODUCIBLE,
  377. const AlgoAttribute& negative_attr =
  378. AlgoAttribute::DEFAULT) const {
  379. return contain_attribute_all(positive_attr) &&
  380. !contain_attribute_any(negative_attr) && usable(opr, param);
  381. }
  382. virtual bool is_preferred(const NCBKernSizeParam&) const {
  383. return false;
  384. }
  385. //! if the algo is naive, it will not split by group
  386. virtual bool is_naive() const { return false; }
  387. using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
  388. };
  389. protected:
  390. //! default impl calls ncb_1g_dispatch_kern()
  391. virtual void exec_with_ncb_kern(const NCBKernParam& param);
  392. //! default impl calls ncb_1g_get_workspace()
  393. virtual size_t get_workspace_with_ncb(const NCBKernSizeParam& param);
  394. //! default impl calls ncb_1g_get_all_algorithms()
  395. virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
  396. const NCBKernSizeParam& param);
  397. //! default impl calls ncb_1g_get_algorithm_heuristic()
  398. virtual Algorithm* get_algorithm_heuristic_with_ncb(
  399. const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
  400. const AlgoAttribute& positive_attr,
  401. const AlgoAttribute& negative_attr);
  402. //! get kernel pointer for float32 non-contiguous batch 1-group kernel
  403. virtual ncb_kern_t ncb_1g_dispatch_kern(Algorithm* algo,
  404. const NCBKernSizeParam& param);
  405. virtual size_t ncb_1g_get_workspace(Algorithm* algo,
  406. const NCBKernSizeParam& param);
  407. virtual std::vector<Algorithm*> ncb_1g_get_all_algorithms(
  408. const NCBKernSizeParam& param);
  409. /*!
  410. * the default impl iterates over all ncb_1g_get_all_algorithms()
  411. * and return the first one whose workspace does not exceed the limit.
  412. */
  413. virtual Algorithm* ncb_1g_get_algorithm_heuristic(
  414. const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
  415. const AlgoAttribute& positive_attr,
  416. const AlgoAttribute& negative_attr);
  417. static bool is_matrix_mul_preferred(const NCBKernSizeParam& param);
  418. /**
  419. * \brief get all the algorithm for the opr.
  420. */
  421. virtual SmallVector<AlgoBase*> get_all_packed_algo();
  422. private:
  423. NCBKernSizeParam m_prev_selected_algo_sizep;
  424. Algorithm* m_prev_selected_algo = nullptr;
  425. //! get algorithm set by user or by heuristic
  426. Algorithm* get_algorithm(const NCBKernSizeParam& param);
  427. NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& filter,
  428. const TensorLayout& diff,
  429. const TensorLayout& grad);
  430. NCBKernParam make_ncb_kern_param(_megdnn_tensor_in filter,
  431. _megdnn_tensor_in diff,
  432. _megdnn_tensor_out grad,
  433. _megdnn_workspace workspace);
  434. class AlgoNaive;
  435. class AlgoDirect;
  436. class AlgoMatrixMul;
  437. class AlgoPack;
  438. Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
  439. public:
  440. //! maintain all the algos of in the opr of fallback
  441. static const AlgoPack& algo_pack();
  442. };
  443. } // namespace fallback
  444. } // namespace megdnn
//! unpack NCBKernSizeParam into local variables (N, IC, IH, IW, ...)
//! legend: N batch; IC/OC input/output channels per group; IH/IW, OH/OW
//! input/output spatial sizes; FH/FW filter spatial; SH/SW strides;
//! PH/PW paddings (no comments inside the macro: a `//` on a continuation
//! line would swallow the trailing backslash)
#define UNPACK_CONV_F32_NCB_KERN_SIZES(_p)                                   \
    auto N = _p.n, IC = _p.filter_meta.icpg, IH = _p.isz[0], IW = _p.isz[1], \
         OC = _p.filter_meta.ocpg, OH = _p.osz[0], OW = _p.osz[1],           \
         FH = _p.filter_meta.spatial[0], FW = _p.filter_meta.spatial[1],     \
         SH = _p.filter_meta.stride[0], SW = _p.filter_meta.stride[1],       \
         PH = _p.filter_meta.padding[0], PW = _p.filter_meta.padding[1]
  452. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台