

/**
 * \file dnn/src/cuda/conv_bias/algo.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once

#include "megdnn/oprs.h"

#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
#include "src/common/utils.h"
#include "src/cuda/conv_bias/conv_bias_int8.cuh"
#include "src/cuda/conv_bias/helper.h"
#include "src/cuda/conv_bias/opr_impl.h"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/cudnn_wrapper.h"
#include "src/cuda/handle.h"

#include <cuda.h>
#include <memory>
#include <unordered_map>

namespace megdnn {
namespace cuda {

/*!
 * \brief base class for conv bias algos
 *
 * All the algo impls should try to support non-contiguous batch dim, for group
 * conv execution.
 */
class ConvBiasForwardImpl::AlgoBase : public Algorithm {
protected:
    ~AlgoBase() = default;

public:
    enum class AlgoType : uint32_t {
        CUDA_CUDNN_CONVBIAS,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
        CUDA_CHANWISE_INT8X8X32,
        CUDA_CUDNN_CONV,
        CUDA_INPLACE_MATMUL,
        CUDA_MATMUL,
        CUDA_MATMUL_INT8X8X32,
        CUDA_BATCHED_MATMUL,
        CUDA_GROUP_CONV_GENERAL,
        CUDA_WMMA_UINT4X4X32,
        CUDA_IMPLICIT_GEMM_CHWN4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_NCHW4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8,
        CUDA_BFLOAT16,
        CUDA_IMPLICIT_GEMM_SASS_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_SASS_NCHW32_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8,
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }

    struct SizeArgs : public conv_bias::BiasForwardSizeArgs {
        ConvBiasForwardImpl* opr;
        const PreprocessedFilter* preprocessed_filter;

        std::string to_string() const;
        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter, const TensorLayout& bias,
                 const TensorLayout& z, const TensorLayout& dst,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter,
                 const CanonizedFilterMeta& filter_meta,
                 const TensorLayout& bias, const TensorLayout& z,
                 const TensorLayout& dst,
                 const PreprocessedFilter* preprocessed_filter = nullptr);

        void init_conv_bias_desc(conv_bias::CUDNNForwardDescs& desc) const {
            desc.set_conv_bias(*src_layout, filter_meta, *dst_layout,
                               *bias_layout, *z_layout, opr->param());
        }

        void init_conv_desc(conv_bias::CUDNNForwardDescs& desc) const {
            desc.set_conv(*src_layout, filter_meta, *dst_layout, opr->param());
        }
    };

    struct ExecArgs : public SizeArgs {
        const TensorND *src_tensor, *filter_tensor, *bias_tensor, *z_tensor,
                *dst_tensor;
        Workspace workspace;

        ExecArgs(ConvBiasForwardImpl* opr, _megdnn_tensor_in src,
                 _megdnn_tensor_in filter, _megdnn_tensor_in bias,
                 _megdnn_tensor_in z, _megdnn_tensor_out dst,
                 _megdnn_workspace workspace,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
    };

    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;
    virtual size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const {
        MEGDNN_MARK_USED_VAR(args);
        return 0;
    }
    virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const {
        MEGDNN_MARK_USED_VAR(args);
        return {};
    }
    virtual void exec_preprocess(const ExecArgs& args) const {
        MEGDNN_MARK_USED_VAR(args);
    }

    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

    bool is_available_reproducible(
            const SizeArgs& args, bool reproducible = true,
            size_t limit = std::numeric_limits<size_t>::max()) {
        return (!reproducible ||
                contain_attribute(AlgoAttribute::REPRODUCIBLE)) &&
               is_available_wk(args, limit);
    }

    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(
                req <= workspace.size,
                "conv bias fwd algo %s: required workspace %zu bytes, got %zu",
                name(), req, workspace.size);
        return *this;
    }

    virtual bool is_cudnn() const { return false; }
};

class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation final : public AlgoBase {
public:
    AlgoCUDNNConvBiasActivation(cudnnConvolutionFwdAlgo_t cudnn_enum)
            : m_cudnn_enum(cudnn_enum) {
        megdnn_assert(CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
                      CudnnAlgoPack::conv_fwd_algos().end());
        m_attr = CudnnAlgoPack::conv_fwd_algos().at(cudnn_enum);
        m_name = ConvBiasForward::algo_name<DefaultParam>(
                "CUDNN:ConvBiasActivation:" + m_attr.name, {});
    }

    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    param::Convolution get_param_convolution(const SizeArgs& args) const;
    bool is_available(const SizeArgs&) const override;

    const char* name() const override { return m_name.c_str(); }

    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }

    cudnnConvolutionFwdAlgo_t cudnn_enum() { return m_cudnn_enum; }

    bool is_cudnn() const override { return true; }

    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONVBIAS)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }

private:
    std::string m_name;
    cudnnConvolutionFwdAlgo_t m_cudnn_enum;
    CudnnAlgoPack::Attr m_attr;
};

class ConvBiasForwardImpl::AlgoChanwise final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name =
                    ConvBiasForward::algo_name<DirectParam>("CHANNEL_WISE", {});
        }
        return m_name.c_str();
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE)

private:
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoChanwiseSmall final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<DirectParam>(
                    "CHANNEL_WISE_SMALL", {});
        }
        return m_name.c_str();
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoChanwise8x8x32 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<DirectParam>(
                    "CHANNEL_WISE_8X8X32", {});
        }
        return m_name.c_str();
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_INT8X8X32)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoCUDNNConv final : public AlgoBase {
public:
    AlgoCUDNNConv(cudnnConvolutionFwdAlgo_t cudnn_enum)
            : m_cudnn_enum(cudnn_enum) {
        megdnn_assert(CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
                      CudnnAlgoPack::conv_fwd_algos().end());
        m_attr = CudnnAlgoPack::conv_fwd_algos().at(cudnn_enum);
        m_name = ConvBiasForward::algo_name<DefaultParam>(
                "CUDNN:Convolution:" + m_attr.name, {});
    }

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }

    const char* name() const override { return m_name.c_str(); }

    cudnnConvolutionFwdAlgo_t cudnn_enum() const { return m_cudnn_enum; }

    bool is_cudnn() const override { return true; }

    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONV)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }

private:
    std::string m_name;
    cudnnConvolutionFwdAlgo_t m_cudnn_enum;
    CudnnAlgoPack::Attr m_attr;

    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

//! compute small matmul in the kernel
class ConvBiasForwardImpl::AlgoInplaceMatmul final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBias::MatmulParam>(
                    "INPLACE_MATMUL", {});
        }
        return m_name.c_str();
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_INPLACE_MATMUL)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    mutable std::string m_name;
};

//! im2col and matmul, with dilation
class ConvBiasForwardImpl::AlgoMatmul final : public AlgoBase {
    template <typename T>
    static void exec_internal(const ExecArgs& args,
                              const WorkspaceBundle& bundle);

public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBias::MatmulParam>("MATMUL",
                                                                       {});
        }
        return m_name.c_str();
    }

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;

    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoMatmul8x8x32 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                    "MATMUL8X8X32", {});
        }
        return m_name.c_str();
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL_INT8X8X32)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    bool need_src_unroll(const SizeArgs& args) const;
    bool need_filter_reshape(const SizeArgs& args) const;

    template <Param::Format>
    WorkspaceBundle get_bundle(const SizeArgs& args) const;

    template <Param::Format>
    void exec_internal(const ExecArgs& args) const;

    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoBatchedMatmul final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                    "BATCHED_MATMUL", {});
        }
        return m_name.c_str();
    }

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

    MEGDNN_DECL_ALGO_TYPE(CUDA_BATCHED_MATMUL)

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    mutable std::string m_name;
};

//! implement group conv by another algo
class ConvBiasForwardImpl::AlgoGroupConvGeneral final : public AlgoBase {
public:
    AlgoGroupConvGeneral(AlgoBase* impl);

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }

    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_impl->contain_attribute(AlgoAttribute::REPRODUCIBLE)) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }

    static void modify_size_args(SizeArgs& args, TensorLayout& src_pg,
                                 TensorLayout& dst_pg, TensorLayout& bias_pg);

    MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_impl->name(), ret);
        return ret;
    }

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    AlgoBase* m_impl;
    std::string m_name;
};

#if CUDA_VERSION >= 10000
class ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA final : public AlgoBase {
public:
    AlgoQUInt4x4x32WMMA() = default;

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "QUINT4x4x32_WMMA"; }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
    bool use_kernel_fhxfw(const SizeArgs& args) const;
    size_t get_workspace_in_bytes_do_conv(const SizeArgs& args) const;

    MEGDNN_DECL_ALGO_TYPE(CUDA_WMMA_UINT4X4X32)
};
#endif

class ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm final
        : public AlgoBase {
public:
    AlgoInt8CHWN4DotProdImplicitGemm() = default;

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        return "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM";
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

    template <typename BiasVisitor>
    static void dispatch_nonlinear_mode(
            const int8_t* d_src, const int8_t* d_filter,
            BiasVisitor bias_visitor, const int8_t* d_z, int8_t* d_dst,
            const convolution::ConvParam& param, float alpha, float beta,
            float gamma, float scale, cudaStream_t stream,
            param::ConvBias::NonlineMode nonlinear_mode);

    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_CHWN4_DOTPROD_INT8)
};

class ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;

        std::string to_string() {
            /// default algorithm
            if (threadblock_m == 128 && threadblock_n == 128 &&
                threadblock_k == 32 && warp_m == 32 && warp_n == 64 &&
                warp_k == 32 && stage == 2) {
                return "";
            }
            return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m,
                            threadblock_n, threadblock_k, warp_m, warp_n,
                            warp_k, stage);
        }
    };

    AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
                              m_algo_param.to_string().c_str())} {}

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;

    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }

private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoFallbackNCHWQS8 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        return "FALLBACK_CONV_NCHW_QS8";
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)

private:
    void make_inner_layout(const SizeArgs& args, TensorLayout& inner_src_layout,
                           TensorLayout& inner_weight_layout,
                           TensorLayout& inner_dst_layout,
                           TensorLayout& inner_bias_layout,
                           TensorLayout& inner_z_layout) const;
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

#if CUDA_VERSION >= 10000
class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm final
        : public AlgoBase {
public:
    enum class MMATileSize : uint32_t {
        IMMA16x16x16,
        IMMA32x8x16,
        IMMA8x32x16
    };

    AlgoInt8CHWN4IMMAImplicitGemm(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_" +
                     to_string(m_mma_tile_size)} {}

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

    template <typename BiasVisitor>
    static void dispatch_nonlinear_mode(
            const int8_t* d_src, const int8_t* d_filter,
            BiasVisitor bias_visitor, int8_t* d_z, int8_t* d_dst,
            const convolution::ConvParam& param, float alpha, float beta,
            float gamma, float scale, cudaStream_t stream,
            param::ConvBias::NonlineMode nonlinear_mode,
            MMATileSize mma_tile_size);

    static std::string to_string(MMATileSize mma_tile_size);

    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_CHWN4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }

private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm final
        : public AlgoBase {
public:
    using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;

    AlgoInt8NCHW4IMMAImplicitGemm(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_NCHW4_IMMA_IMPLICIT_GEMM_" +
                     AlgoInt8CHWN4IMMAImplicitGemm::to_string(
                             m_mma_tile_size)} {}

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }

    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
    MMATileSize m_mma_tile_size;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter final
        : public AlgoBase {
public:
    using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;

    AlgoInt8CHWN4IMMAImplicitGemmReorderFilter(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_" +
                     AlgoInt8CHWN4IMMAImplicitGemm::to_string(
                             m_mma_tile_size)} {}

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }

    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth final
        : public AlgoBase {
public:
    using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;

    AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_" +
                     AlgoInt8CHWN4IMMAImplicitGemm::to_string(
                             m_mma_tile_size)} {}

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }

    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};
#endif

#if CUDA_VERSION >= 10020
class ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
    };

    AlgoInt8NCHW32IMMAImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param} {
        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                ssprintf("INT8_NCHW32_IMMA_IMPLICIT_GEMM_%s",
                         to_string(m_algo_param).c_str()),
                ConvBias::DirectParam{});
    }

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

    static std::string to_string(AlgoParam algo_param);

    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;

    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }

private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
};
#endif

class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;

    const char* name() const override { return "CONVBIAS_BFLOAT16"; }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

class ConvBiasForwardImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_all_algos_map;

public:
    AlgoPack();

    std::vector<AlgoBase*> all_algos,
            //! non-cudnn algos, used for heuristic if cudnn is not supported
            non_cudnn_algos, bfloat16_algos;
    std::vector<AlgoCUDNNConvBiasActivation> cudnn_conv_bias_activations;
    std::vector<AlgoCUDNNConv> cudnn_convs;
    AlgoFallbackNCHWQS8 fallback_nchw_qs8;
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
    AlgoChanwise8x8x32 chanwise8x8x32;
    AlgoInplaceMatmul inplace_matmul;
    AlgoMatmul matmul;
    AlgoMatmul8x8x32 matmul8x8x32;
    AlgoBatchedMatmul batched_matmul;
    std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
    AlgoInt8CHWN4DotProdImplicitGemm int8_chwn4_dotprod;
#if CUDA_VERSION >= 10000
    AlgoQUInt4x4x32WMMA wmma_quint4x4x32;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemm> int8_chwn4_imma;
    std::vector<AlgoInt8NCHW4IMMAImplicitGemm> int8_nchw4_imma;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemmReorderFilter>
            int8_chwn4_imma_reorder_filter;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth>
            int8_chwn4_imma_unroll_width;
#endif
#if CUDA_VERSION >= 10020
    std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
#endif
    std::vector<std::unique_ptr<AlgoGroupConvGeneral>> gconv_refhold;
    AlgoBFloat16 bfloat16;
    std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;

    AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo);

    AlgoBase* cudnn_conv_from_enum(cudnnConvolutionFwdAlgo_t algo);

    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }

private:
#if CUDA_VERSION >= 10000
    void fill_imma_algos();
#endif
    void fill_cudnn_algos();
    void fill_dp4a_algos();
};

}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen
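
The header above only declares the algorithm interface; the actual selection and dispatch logic lives elsewhere in MegEngine. The pattern behind AlgoBase, however, is simple: each algorithm reports whether it can handle the given sizes and how much workspace it needs, and a caller picks a candidate whose workspace fits a limit (compare is_available_wk above). The following is a minimal, self-contained sketch of that pattern only; it is not MegEngine code, and the names DummyDirect, DummyMatmul, and pick_algo are hypothetical.

// Minimal sketch of the "algo" pattern used by AlgoBase above.
// NOT MegEngine code: SizeArgs is reduced to a plain struct, and the
// algorithm classes and pick_algo() are hypothetical illustrations.
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

struct SizeArgs {  // stand-in for AlgoBase::SizeArgs
    size_t batch, channels, spatial;
};

class AlgoBase {
public:
    virtual ~AlgoBase() = default;
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual const char* name() const = 0;

    // same shape as AlgoBase::is_available_wk in the real header
    bool is_available_wk(const SizeArgs& args, size_t limit) const {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }
};

class DummyDirect final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override {
        return args.channels <= 64;  // pretend it only supports few channels
    }
    size_t get_workspace_in_bytes(const SizeArgs&) const override { return 0; }
    const char* name() const override { return "DUMMY_DIRECT"; }
};

class DummyMatmul final : public AlgoBase {
public:
    bool is_available(const SizeArgs&) const override { return true; }
    size_t get_workspace_in_bytes(const SizeArgs& args) const override {
        return args.batch * args.channels * args.spatial;  // im2col-like buffer
    }
    const char* name() const override { return "DUMMY_MATMUL"; }
};

// Pick the first algorithm that is available and fits the workspace limit.
const AlgoBase* pick_algo(const std::vector<const AlgoBase*>& algos,
                          const SizeArgs& args,
                          size_t limit = std::numeric_limits<size_t>::max()) {
    for (auto algo : algos) {
        if (algo->is_available_wk(args, limit))
            return algo;
    }
    return nullptr;
}

int main() {
    DummyDirect direct;
    DummyMatmul matmul;
    std::vector<const AlgoBase*> algos{&direct, &matmul};
    SizeArgs args{32, 128, 56 * 56};
    if (const AlgoBase* algo = pick_algo(algos, args)) {
        std::printf("picked %s, workspace %zu bytes\n", algo->name(),
                    algo->get_workspace_in_bytes(args));
    }
    return 0;
}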

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU/GPU build. To run GPU programs, make sure the machine has a GPU device and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, visit MegStudio.
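
If you are unsure whether the GPU and driver are set up correctly, a generic CUDA runtime check (independent of MegEngine; the file name check_gpu.cu is arbitrary) can confirm that a device is visible:

// Generic CUDA runtime check, not part of MegEngine.
// Build with: nvcc check_gpu.cu -o check_gpu
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        std::printf("CUDA runtime error: %s\n", cudaGetErrorString(err));
        return 1;
    }
    std::printf("found %d CUDA device(s)\n", count);
    return count > 0 ? 0 : 1;
}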