You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

algo.h 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. /**
  2. * \file dnn/src/cuda/convolution/backward_data/algo.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #pragma once
  13. #include <unordered_map>
  14. #include "src/common/algo_base.h"
  15. #include "src/common/metahelper.h"
  16. #include "src/cuda/convolution/helper.h"
  17. #include "src/cuda/cudnn_wrapper.h"
  18. namespace megdnn {
  19. namespace cuda {
  20. /*!
  21. * \brief base class for convolution algos
  22. *
  23. * All the algo impls should try to support non-contiguous batch dim, for group
  24. * conv execution.
  25. */
  26. class ConvolutionBackwardDataImpl::AlgoBase : public Algorithm {
  27. protected:
  28. ~AlgoBase() = default;
  29. public:
  30. enum class AlgoType : uint32_t {
  31. CUDA_CUDNN,
  32. CUDA_MATMUL,
  33. CUDA_CHANWISE,
  34. CUDA_CHANWISE_SMALL,
  35. CUDA_BFLOAT16,
  36. CUDA_GROUP_CONV_GENERAL,
  37. CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
  38. CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8
  39. };
  40. using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
  41. AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
  42. struct SizeArgs {
  43. HandleImpl* handle;
  44. CanonizedFilterMeta filter_meta;
  45. const TensorLayout *diff_layout, *grad_layout, *filter_layout;
  46. const ConvolutionBackwardDataImpl* opr;
  47. std::string to_string() const;
  48. void init_desc(convolution::CUDNNBwdDataDescs& desc) const {
  49. desc.set(filter_meta, *diff_layout, *grad_layout, opr->param());
  50. }
  51. SizeArgs(const ConvolutionBackwardDataImpl* opr,
  52. const TensorLayout& filter, const TensorLayout& diff,
  53. const TensorLayout& grad);
  54. SizeArgs(const ConvolutionBackwardDataImpl* opr,
  55. const TensorLayout& filter,
  56. const CanonizedFilterMeta& filter_meta,
  57. const TensorLayout& diff, const TensorLayout& grad);
  58. convolution::ForwardSizeArgs as_fwd_args() const {
  59. return {handle, grad_layout, filter_layout, filter_meta,
  60. diff_layout};
  61. }
  62. };
  63. struct ExecArgs : public SizeArgs {
  64. const TensorND *filter_tensor, *diff_tensor, *grad_tensor;
  65. Workspace workspace;
  66. ExecArgs(const ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
  67. _megdnn_tensor_in diff, _megdnn_tensor_out grad,
  68. _megdnn_workspace workspace);
  69. };
  70. virtual bool is_available(const SizeArgs& args) const = 0;
  71. virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
  72. virtual void exec(const ExecArgs& args) const = 0;
  73. bool is_available_wk(const SizeArgs& args, size_t limit) {
  74. return is_available(args) && get_workspace_in_bytes(args) <= limit;
  75. }
  76. bool is_available_attribute(
  77. const SizeArgs& args,
  78. const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
  79. const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT,
  80. size_t limit = std::numeric_limits<size_t>::max()) {
  81. return contain_attribute_all(positive_attr) &&
  82. !contain_attribute_any(negative_attr) &&
  83. is_available_wk(args, limit);
  84. }
  85. AlgoBase& check_workspace(const SizeArgs& args,
  86. const Workspace& workspace) {
  87. auto req = get_workspace_in_bytes(args);
  88. megdnn_assert(req <= workspace.size,
  89. "conv bwd data algo %s: "
  90. "required workspace %zu bytes, got %zu",
  91. name(), req, workspace.size);
  92. return *this;
  93. }
  94. virtual bool is_cudnn() const { return false; }
  95. };
  96. class ConvolutionBackwardDataImpl::AlgoCUDNN final : public AlgoBase {
  97. cudnnConvolutionBwdDataAlgo_t m_cudnn_enum;
  98. CudnnAlgoPack::Attr m_attr;
  99. public:
  100. AlgoCUDNN(cudnnConvolutionBwdDataAlgo_t cudnn_enum)
  101. : m_cudnn_enum(cudnn_enum) {
  102. megdnn_assert(CudnnAlgoPack::conv_bwd_data_algos().find(cudnn_enum) !=
  103. CudnnAlgoPack::conv_bwd_data_algos().end());
  104. m_attr = CudnnAlgoPack::conv_bwd_data_algos().at(cudnn_enum);
  105. }
  106. bool is_available(const SizeArgs& args) const override;
  107. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  108. void exec(const ExecArgs& args) const override;
  109. const char* name() const override { return m_attr.name.c_str(); }
  110. AlgoAttribute attribute() const override {
  111. auto ret = static_cast<AlgoAttribute>(0);
  112. if (m_attr.is_reproducible) {
  113. ret |= AlgoAttribute::REPRODUCIBLE;
  114. }
  115. if (m_attr.accuracy_depend_on_batch) {
  116. ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
  117. }
  118. return ret;
  119. }
  120. cudnnConvolutionBwdDataAlgo_t cudnn_enum() const { return m_cudnn_enum; }
  121. bool is_cudnn() const override { return true; }
  122. MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)
  123. std::string param() const override {
  124. std::string ret;
  125. serialize_write_pod(m_cudnn_enum, ret);
  126. return ret;
  127. }
  128. };
  129. //! im2col and matmul, with dilation
  130. class ConvolutionBackwardDataImpl::AlgoMatmul final : public AlgoBase {
  131. template <typename T>
  132. static void exec_internal(const ExecArgs& args);
  133. public:
  134. bool is_available(const SizeArgs& args) const override;
  135. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  136. void exec(const ExecArgs& args) const override;
  137. std::vector<SearchItem> get_subopr_list(
  138. const TensorLayoutArray& layouts,
  139. const OperatorBase* opr) const override;
  140. const char* name() const override { return "MATMUL"; }
  141. MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
  142. AlgoAttribute attribute() const override {
  143. return AlgoAttribute::REPRODUCIBLE |
  144. AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
  145. }
  146. };
  147. class ConvolutionBackwardDataImpl::AlgoChanwise final : public AlgoBase {
  148. public:
  149. bool is_available(const SizeArgs& args) const override;
  150. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  151. void exec(const ExecArgs& args) const override;
  152. const char* name() const override { return "CHANNEL_WISE"; }
  153. MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE)
  154. AlgoAttribute attribute() const override {
  155. return AlgoAttribute::REPRODUCIBLE;
  156. }
  157. };
  158. class ConvolutionBackwardDataImpl::AlgoChanwiseSmall final : public AlgoBase {
  159. public:
  160. bool is_available(const SizeArgs& args) const override;
  161. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  162. void exec(const ExecArgs& args) const override;
  163. const char* name() const override { return "CHANNEL_WISE_SMALL"; }
  164. MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL)
  165. AlgoAttribute attribute() const override {
  166. return AlgoAttribute::REPRODUCIBLE |
  167. AlgoAttribute::USABLE_DEPEND_ON_SHAPE;
  168. }
  169. };
  170. class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase {
  171. public:
  172. bool is_available(const SizeArgs& args) const override;
  173. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  174. void exec(const ExecArgs& args) const override;
  175. std::vector<SearchItem> get_subopr_list(
  176. const TensorLayoutArray& layouts,
  177. const OperatorBase* opr) const override;
  178. const char* name() const override {
  179. return "CONVOLUTION_BACKWARD_DATD_BFLOAT16";
  180. }
  181. AlgoAttribute attribute() const override {
  182. return AlgoAttribute::REPRODUCIBLE;
  183. }
  184. private:
  185. WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
  186. MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
  187. };
  188. //! implement group conv by another algo
  189. class ConvolutionBackwardDataImpl::AlgoGroupConvGeneral final
  190. : public AlgoBase {
  191. public:
  192. bool is_available(const SizeArgs& args) const override;
  193. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  194. void exec(const ExecArgs& args) const override;
  195. std::vector<SearchItem> get_subopr_list(
  196. const TensorLayoutArray& layouts,
  197. const OperatorBase* opr) const override;
  198. const char* name() const override {
  199. return "CUDA:GROUP_CONV_BACKWARD_DATA";
  200. }
  201. MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)
  202. AlgoAttribute attribute() const override {
  203. return AlgoAttribute::REPRODUCIBLE;
  204. }
  205. private:
  206. WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
  207. };
  208. class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final
  209. : public AlgoBase {
  210. public:
  211. struct AlgoParam {
  212. int threadblock_m;
  213. int threadblock_n;
  214. int threadblock_k;
  215. int warp_m;
  216. int warp_n;
  217. int warp_k;
  218. int stage;
  219. std::string to_string() {
  220. return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m,
  221. threadblock_n, threadblock_k, warp_m, warp_n,
  222. warp_k, stage);
  223. }
  224. };
  225. AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param)
  226. : m_algo_param{algo_param},
  227. m_name{ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
  228. m_algo_param.to_string().c_str())} {}
  229. bool is_available(const SizeArgs& args) const override;
  230. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  231. void exec(const ExecArgs& args) const override;
  232. const char* name() const override { return m_name.c_str(); }
  233. AlgoAttribute attribute() const override {
  234. return AlgoAttribute::REPRODUCIBLE;
  235. }
  236. MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)
  237. private:
  238. WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
  239. const SizeArgs& args) const;
  240. const void* get_available_op(const SizeArgs& args) const;
  241. AlgoParam m_algo_param;
  242. std::string m_name;
  243. };
  244. class ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm final
  245. : public AlgoBase {
  246. public:
  247. bool is_available(const SizeArgs& args) const override;
  248. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  249. void exec(const ExecArgs& args) const override;
  250. const char* name() const override {
  251. return "INT8_NCHW_DOTPROD_IMPLICIT_GEMM";
  252. }
  253. AlgoAttribute attribute() const override {
  254. return AlgoAttribute::REPRODUCIBLE;
  255. }
  256. MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8);
  257. private:
  258. WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
  259. const SizeArgs& args) const;
  260. const void* get_available_op(const SizeArgs& args) const;
  261. };
  262. class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
  263. // defined in cudnn.cpp
  264. void fill_cudnn_algos();
  265. // defined in implicit_gemm_int8_nchw4_dp4a.cpp
  266. void fill_int8_dp4a_algos();
  267. AlgoBase::Mapper m_all_algos_map;
  268. public:
  269. AlgoPack();
  270. std::vector<AlgoCUDNN> cudnn;
  271. AlgoMatmul matmul;
  272. AlgoChanwise chanwise;
  273. AlgoChanwiseSmall chanwise_small;
  274. AlgoBFloat16 bfloat16;
  275. AlgoGroupConvGeneral group;
  276. std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
  277. AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod;
  278. std::vector<AlgoBase*>
  279. //! all algorithms
  280. all_algos,
  281. //! non-cudnn algos, used for heuristic if cudnn is not supported
  282. non_cudnn_algos, bfloat16_algos, int8_algos;
  283. AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdDataAlgo_t algo);
  284. const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
  285. };
  286. } // namespace cuda
  287. } // namespace megdnn
  288. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台