You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

algo.h 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
/**
 * \file dnn/src/cuda/convolution/backward_data/algo.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
  12. #pragma once
  13. #include <unordered_map>
  14. #include "src/common/algo_base.h"
  15. #include "src/common/metahelper.h"
  16. #include "src/cuda/convolution/helper.h"
  17. #include "src/cuda/cudnn_wrapper.h"
  18. namespace megdnn {
  19. namespace cuda {
  20. /*!
  21. * \brief base class for convolution algos
  22. *
  23. * All the algo impls should try to support non-contiguous batch dim, for group
  24. * conv execution.
  25. */
class ConvolutionBackwardDataImpl::AlgoBase : public Algorithm {
protected:
    //! protected non-virtual dtor: concrete algos are owned by AlgoPack and
    //! are never deleted through an AlgoBase pointer
    ~AlgoBase() = default;

public:
    //! unique tag for each concrete algo class; consumed by
    //! MEGDNN_DECL_ALGO_TYPE for algorithm identification/serialization
    enum class AlgoType : uint32_t {
        CUDA_CUDNN,
        CUDA_MATMUL,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
        CUDA_BFLOAT16,
        CUDA_GROUP_CONV_GENERAL,
        CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }

    //! layouts and metadata needed to decide availability / workspace size
    struct SizeArgs {
        HandleImpl* handle;
        CanonizedFilterMeta filter_meta;
        const TensorLayout *diff_layout, *grad_layout, *filter_layout;
        ConvolutionBackwardDataImpl* opr;

        std::string to_string() const;

        //! fill the cuDNN bwd-data descriptors from the stored layouts and
        //! the operator's param
        void init_desc(convolution::CUDNNBwdDataDescs& desc) const {
            desc.set(filter_meta, *diff_layout, *grad_layout, opr->param());
        }
        SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
                 const TensorLayout& diff, const TensorLayout& grad);
        SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
                 const CanonizedFilterMeta& filter_meta,
                 const TensorLayout& diff, const TensorLayout& grad);

        //! view these bwd-data args as forward-conv args: grad takes the
        //! role of src and diff the role of dst
        convolution::ForwardSizeArgs as_fwd_args() const {
            return {handle, grad_layout, filter_layout, filter_meta,
                    diff_layout};
        }
    };
    //! SizeArgs plus the concrete tensors and workspace used at exec time
    struct ExecArgs : public SizeArgs {
        const TensorND *filter_tensor, *diff_tensor, *grad_tensor;
        Workspace workspace;

        ExecArgs(ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
                 _megdnn_tensor_in diff, _megdnn_tensor_out grad,
                 _megdnn_workspace workspace);
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;

    //! whether this algo is available and its workspace fits in \p limit bytes
    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

    //! whether this algo is available, carries all \p positive_attr, carries
    //! none of \p negative_attr, and fits in \p limit bytes of workspace
    bool is_available_attribute(
            const SizeArgs& args,
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT,
            size_t limit = std::numeric_limits<size_t>::max()) {
        return contain_attribute_all(positive_attr) &&
               !contain_attribute_any(negative_attr) &&
               is_available_wk(args, limit);
    }

    //! assert that \p workspace is large enough for \p args; returns *this
    //! so the call can be chained
    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(req <= workspace.size,
                      "conv bwd data algo %s: "
                      "required workspace %zu bytes, got %zu",
                      name(), req, workspace.size);
        return *this;
    }

    virtual bool is_cudnn() const { return false; }
};
//! wrapper over a single cuDNN backward-data algorithm enum
class ConvolutionBackwardDataImpl::AlgoCUDNN final : public AlgoBase {
    cudnnConvolutionBwdDataAlgo_t m_cudnn_enum;
    CudnnAlgoPack::Attr m_attr;

public:
    AlgoCUDNN(cudnnConvolutionBwdDataAlgo_t cudnn_enum)
            : m_cudnn_enum(cudnn_enum) {
        // the enum must be registered in CudnnAlgoPack, which supplies the
        // name/attribute metadata used below
        megdnn_assert(CudnnAlgoPack::conv_bwd_data_algos().find(cudnn_enum) !=
                      CudnnAlgoPack::conv_bwd_data_algos().end());
        m_attr = CudnnAlgoPack::conv_bwd_data_algos().at(cudnn_enum);
    }

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_attr.name.c_str(); }

    //! attributes are derived from the CudnnAlgoPack metadata for this enum
    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        if (m_attr.accuracy_depend_on_batch) {
            ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
        }
        return ret;
    }

    cudnnConvolutionBwdDataAlgo_t cudnn_enum() const { return m_cudnn_enum; }

    bool is_cudnn() const override { return true; }

    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)

    //! serialized param is just the raw cuDNN enum value
    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }
};
//! im2col and matmul, with dilation
class ConvolutionBackwardDataImpl::AlgoMatmul final : public AlgoBase {
    //! dtype-dispatched implementation shared by exec()
    template <typename T>
    static void exec_internal(const ExecArgs& args);

public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    //! sub-operators (the underlying matmul) that heuristic search must
    //! also consider
    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;

    const char* name() const override { return "MATMUL"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE |
               AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
    }
};
//! channel-wise (depthwise) convolution backward data
class ConvolutionBackwardDataImpl::AlgoChanwise final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "CHANNEL_WISE"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE)

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
};
//! channel-wise backward data specialized for small shapes; usability
//! depends on the input shape (see USABLE_DEPEND_ON_SHAPE below)
class ConvolutionBackwardDataImpl::AlgoChanwiseSmall final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "CHANNEL_WISE_SMALL"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL)

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE |
               AlgoAttribute::USABLE_DEPEND_ON_SHAPE;
    }
};
//! bfloat16 backward data, delegating to a sub-operator (see
//! get_subopr_list) with converted dtypes
class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;

    const char* name() const override {
        // NOTE(review): "DATD" looks like a typo for "DATA", but this string
        // is the algorithm's public identity (used for lookup/serialization)
        // -- confirm impact on persisted algo caches before renaming.
        return "CONVOLUTION_BACKWARD_DATD_BFLOAT16";
    }

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
};
//! implement group conv by another algo
class ConvolutionBackwardDataImpl::AlgoGroupConvGeneral final
        : public AlgoBase {
    AlgoBase* m_impl;    //!< the wrapped per-group algo; non-owning
    std::string m_name;

public:
    AlgoGroupConvGeneral(AlgoBase* impl);

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }

    //! rewrite \p args (and the per-group diff/grad layouts) so they
    //! describe a single group for the wrapped algo
    static void modify_size_args(SizeArgs& args, TensorLayout& diff_pg,
                                 TensorLayout& grad_pg);
    MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)

    //! inherit every inheritable attribute from the wrapped algo
    AlgoAttribute attribute() const override {
        auto ret = AlgoAttribute::DEFAULT;
#define cb(attr)                                   \
    if (m_impl->contain_attribute_all(attr)) {     \
        ret |= attr;                               \
    }
        MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb)
#undef cb
        // NOTE(review): this explicit check appears redundant if
        // REPRODUCIBLE is already listed in
        // MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE -- confirm against the
        // macro definition before removing.
        if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }
};
  214. class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final
  215. : public AlgoBase {
  216. public:
  217. struct AlgoParam {
  218. int threadblock_m;
  219. int threadblock_n;
  220. int threadblock_k;
  221. int warp_m;
  222. int warp_n;
  223. int warp_k;
  224. int stage;
  225. std::string to_string() {
  226. return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m,
  227. threadblock_n, threadblock_k, warp_m, warp_n,
  228. warp_k, stage);
  229. }
  230. };
  231. AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param)
  232. : m_algo_param{algo_param},
  233. m_name{ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
  234. m_algo_param.to_string().c_str())} {}
  235. bool is_available(const SizeArgs& args) const override;
  236. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  237. void exec(const ExecArgs& args) const override;
  238. const char* name() const override { return m_name.c_str(); }
  239. AlgoAttribute attribute() const override {
  240. return AlgoAttribute::REPRODUCIBLE;
  241. }
  242. MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)
  243. private:
  244. WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
  245. const SizeArgs& args) const;
  246. AlgoParam m_algo_param;
  247. std::string m_name;
  248. };
  249. class ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm final
  250. : public AlgoBase {
  251. public:
  252. bool is_available(const SizeArgs& args) const override;
  253. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  254. void exec(const ExecArgs& args) const override;
  255. const char* name() const override {
  256. return "INT8_NCHW_DOTPROD_IMPLICIT_GEMM";
  257. }
  258. AlgoAttribute attribute() const override {
  259. return AlgoAttribute::REPRODUCIBLE;
  260. }
  261. MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8);
  262. private:
  263. WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
  264. const SizeArgs& args) const;
  265. };
//! container owning one instance of every backward-data algo, plus the
//! lookup tables used by heuristic/exhaustive search
class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
    // defined in cudnn.cpp
    void fill_cudnn_algos();
    // defined in implicit_gemm_int8_nchw4_dp4a.cpp
    void fill_int8_dp4a_algos();

    AlgoBase::Mapper m_all_algos_map;

public:
    AlgoPack();

    std::vector<AlgoCUDNN> cudnn;
    AlgoMatmul matmul;
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
    std::vector<AlgoGroupConvGeneral> gconv;
    // presumably maps a base algo to its group-conv wrapper -- confirm
    // against the AlgoPack() definition
    std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;
    AlgoBFloat16 bfloat16;
    std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
    AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod;

    std::vector<AlgoBase*>
            //! all algorithms
            all_algos,
            //! non-cudnn algos, used for heuristic if cudnn is not supported
            non_cudnn_algos, bfloat16_algos, int8_algos;

    //! look up the AlgoCUDNN wrapper for a raw cuDNN enum
    AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdDataAlgo_t algo);

    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
  291. } // namespace cuda
  292. } // namespace megdnn
  293. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台