
algo.h 12 kB

/**
 * \file dnn/src/cuda/convolution/backward_data/algo.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once

#include <unordered_map>
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
#include "src/cuda/convolution/helper.h"
#include "src/cuda/cudnn_wrapper.h"

namespace megdnn {
namespace cuda {
/*!
 * \brief base class for convolution algos
 *
 * All the algo impls should try to support non-contiguous batch dim, for group
 * conv execution.
 */
class ConvolutionBackwardDataImpl::AlgoBase : public Algorithm {
protected:
    ~AlgoBase() = default;

public:
    enum class AlgoType : uint32_t {
        CUDA_CUDNN,
        CUDA_MATMUL,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
        CUDA_BFLOAT16,
        CUDA_GROUP_CONV_GENERAL,
        CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }

    struct SizeArgs {
        HandleImpl* handle;
        CanonizedFilterMeta filter_meta;
        const TensorLayout *diff_layout, *grad_layout, *filter_layout;
        ConvolutionBackwardDataImpl* opr;

        std::string to_string() const;
        void init_desc(convolution::CUDNNBwdDataDescs& desc) const {
            desc.set(filter_meta, *diff_layout, *grad_layout, opr->param());
        }
        SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
                 const TensorLayout& diff, const TensorLayout& grad);
        SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
                 const CanonizedFilterMeta& filter_meta,
                 const TensorLayout& diff, const TensorLayout& grad);

        convolution::ForwardSizeArgs as_fwd_args() const {
            return {handle, grad_layout, filter_layout, filter_meta,
                    diff_layout};
        }
    };
    struct ExecArgs : public SizeArgs {
        const TensorND *filter_tensor, *diff_tensor, *grad_tensor;
        Workspace workspace;

        ExecArgs(ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
                 _megdnn_tensor_in diff, _megdnn_tensor_out grad,
                 _megdnn_workspace workspace);
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;

    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

    bool is_available_attribute(
            const SizeArgs& args,
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT,
            size_t limit = std::numeric_limits<size_t>::max()) {
        return contain_attribute_all(positive_attr) &&
               !contain_attribute_any(negative_attr) &&
               is_available_wk(args, limit);
    }

    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(req <= workspace.size,
                      "conv bwd data algo %s: "
                      "required workspace %zu bytes, got %zu",
                      name(), req, workspace.size);
        return *this;
    }

    virtual bool is_cudnn() const { return false; }
};
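/*
 * Note: the concrete algo classes below implement this interface. A caller
 * typically filters candidates with is_available_wk() or
 * is_available_attribute() and calls check_workspace() before exec();
 * see the sketch after the listing.
 */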
class ConvolutionBackwardDataImpl::AlgoCUDNN final : public AlgoBase {
    cudnnConvolutionBwdDataAlgo_t m_cudnn_enum;
    CudnnAlgoPack::Attr m_attr;

public:
    AlgoCUDNN(cudnnConvolutionBwdDataAlgo_t cudnn_enum)
            : m_cudnn_enum(cudnn_enum) {
        megdnn_assert(CudnnAlgoPack::conv_bwd_data_algos().find(cudnn_enum) !=
                      CudnnAlgoPack::conv_bwd_data_algos().end());
        m_attr = CudnnAlgoPack::conv_bwd_data_algos().at(cudnn_enum);
    }

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_attr.name.c_str(); }

    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }

    cudnnConvolutionBwdDataAlgo_t cudnn_enum() const { return m_cudnn_enum; }

    bool is_cudnn() const override { return true; }

    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }
};
//! im2col and matmul, with dilation
class ConvolutionBackwardDataImpl::AlgoMatmul final : public AlgoBase {
    template <typename T>
    static void exec_internal(const ExecArgs& args);

public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;

    const char* name() const override { return "MATMUL"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
};
class ConvolutionBackwardDataImpl::AlgoChanwise final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "CHANNEL_WISE"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
};
class ConvolutionBackwardDataImpl::AlgoChanwiseSmall final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "CHANNEL_WISE_SMALL"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
};
class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;

    const char* name() const override {
        return "CONVOLUTION_BACKWARD_DATD_BFLOAT16";
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
};
//! implement group conv by another algo
class ConvolutionBackwardDataImpl::AlgoGroupConvGeneral final
        : public AlgoBase {
    AlgoBase* m_impl;
    std::string m_name;

public:
    AlgoGroupConvGeneral(AlgoBase* impl);

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }

    static void modify_size_args(SizeArgs& args, TensorLayout& diff_pg,
                                 TensorLayout& grad_pg);
    MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)

    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_impl->name(), ret);
        return ret;
    }
};
class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m,
                            threadblock_n, threadblock_k, warp_m, warp_n,
                            warp_k, stage);
        }
    };
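    // For example (values illustrative, not a tuned configuration): a
    // threadblock tile of 16x64x8, warp tile of 16x64x8 and 2 stages yields
    // the suffix "_16X64X8_16X64X8_2stage", so the full algo name becomes
    // "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8_2stage".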
    AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
                              m_algo_param.to_string().c_str())} {}

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)

private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;

    AlgoParam m_algo_param;
    std::string m_name;
};
class ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm final
        : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        return "INT8_NCHW_DOTPROD_IMPLICIT_GEMM";
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8);

private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
};
class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
    // defined in cudnn.cpp
    void fill_cudnn_algos();
    // defined in implicit_gemm_int8_nchw4_dp4a.cpp
    void fill_int8_dp4a_algos();

    AlgoBase::Mapper m_all_algos_map;

public:
    AlgoPack();

    std::vector<AlgoCUDNN> cudnn;
    AlgoMatmul matmul;
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
    std::vector<AlgoGroupConvGeneral> gconv;
    std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;
    AlgoBFloat16 bfloat16;
    std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
    AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod;

    std::vector<AlgoBase*>
            //! all algorithms
            all_algos,
            //! non-cudnn algos, used for heuristic if cudnn is not supported
            non_cudnn_algos, bfloat16_algos, int8_algos;

    AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdDataAlgo_t algo);

    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};

}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen
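
The header above only declares the algorithm classes; the actual dispatch logic lives elsewhere in MegDNN. As a minimal sketch of how the AlgoBase interface composes, the hypothetical helper below (pick_algo is not a MegDNN API, and the snippet assumes the surrounding MegDNN build environment) picks the first algorithm from a candidate list that is available for the given layouts and fits a workspace budget, using is_available_wk() exactly as declared above:

#include <cstddef>
#include <vector>
#include "src/cuda/convolution/backward_data/algo.h"

namespace megdnn {
namespace cuda {

using AlgoBase = ConvolutionBackwardDataImpl::AlgoBase;

//! hypothetical helper: return the first algo in `algos` (e.g.
//! AlgoPack::all_algos) that is available and whose workspace
//! requirement does not exceed `workspace_limit`; nullptr if none fits
static AlgoBase* pick_algo(const std::vector<AlgoBase*>& algos,
                           const AlgoBase::SizeArgs& args,
                           size_t workspace_limit) {
    for (auto algo : algos) {
        // is_available_wk() checks availability on the given layouts and
        // compares get_workspace_in_bytes() against the limit
        if (algo->is_available_wk(args, workspace_limit)) {
            return algo;
        }
    }
    return nullptr;
}

}  // namespace cuda
}  // namespace megdnn

MegDNN's real heuristics can additionally filter on algorithm attributes (e.g. requiring REPRODUCIBLE) via is_available_attribute(), as declared in the header.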

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has a GPU installed along with the proper driver. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.