You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

algo.h 30 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842
  1. /**
  2. * \file dnn/src/cuda/conv_bias/algo.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #pragma once
  13. #include "megdnn/oprs.h"
  14. #include "src/common/algo_base.h"
  15. #include "src/common/metahelper.h"
  16. #include "src/common/utils.h"
  17. #include "src/cuda/conv_bias/conv_bias_int8.cuh"
  18. #include "src/cuda/conv_bias/helper.h"
  19. #include "src/cuda/conv_bias/opr_impl.h"
  20. #include "src/cuda/convolution_helper/parameter.cuh"
  21. #include "src/cuda/cudnn_wrapper.h"
  22. #include "src/cuda/handle.h"
  23. #include <cuda.h>
  24. #include <memory>
  25. #include <unordered_map>
  26. namespace megdnn {
  27. namespace cuda {
  28. /*!
  29. * \brief base class for conv bias algos
  30. *
  31. * All the algo impls should try to support non-contiguous batch dim, for group
  32. * conv execution.
  33. */
class ConvBiasForwardImpl::AlgoBase : public Algorithm {
protected:
    //! protected non-virtual dtor: prevents deletion through a base pointer
    //! (concrete algo objects live inside AlgoPack for the process lifetime)
    ~AlgoBase() = default;

public:
    //! discriminator tag declared by each concrete class via
    //! MEGDNN_DECL_ALGO_TYPE; values are serialized into algorithm
    //! descriptions, so existing entries must not be reordered or renumbered
    enum class AlgoType : uint32_t {
        CUDA_CUDNN_CONVBIAS,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
        CUDA_CHANWISE_INT8X8X32,
        CUDA_CUDNN_CONV,
        CUDA_INPLACE_MATMUL,
        CUDA_MATMUL,
        CUDA_MATMUL_INT8X8X32,
        CUDA_BATCHED_MATMUL,
        CUDA_GROUP_CONV_GENERAL,
        CUDA_WMMA_UINT4X4X32,
        CUDA_IMPLICIT_GEMM_CHWN4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_NCHW4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8,
        CUDA_BFLOAT16,
        CUDA_IMPLICIT_GEMM_SASS_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_SASS_NCHW32_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8,
    };

    //! maps a persisted algorithm description back to the live algo object
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }

    //! layout-only arguments: enough for availability and workspace queries
    struct SizeArgs : public conv_bias::BiasForwardSizeArgs {
        ConvBiasForwardImpl* opr;
        const PreprocessedFilter* preprocessed_filter;

        std::string to_string() const;
        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter, const TensorLayout& bias,
                 const TensorLayout& z, const TensorLayout& dst,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter,
                 const CanonizedFilterMeta& filter_meta,
                 const TensorLayout& bias, const TensorLayout& z,
                 const TensorLayout& dst,
                 const PreprocessedFilter* preprocessed_filter = nullptr);

        //! fill cuDNN descriptors for the fused conv + bias (+ z) form
        void init_conv_bias_desc(conv_bias::CUDNNForwardDescs& desc) const {
            desc.set_conv_bias(*src_layout, filter_meta, *dst_layout,
                               *bias_layout, *z_layout, opr->param());
        }

        //! fill cuDNN descriptors for the plain convolution only
        void init_conv_desc(conv_bias::CUDNNForwardDescs& desc) const {
            desc.set_conv(*src_layout, filter_meta, *dst_layout, opr->param());
        }
    };

    //! SizeArgs plus the concrete tensors and workspace used at exec time
    struct ExecArgs : public SizeArgs {
        const TensorND *src_tensor, *filter_tensor, *bias_tensor, *z_tensor,
                *dst_tensor;
        Workspace workspace;

        ExecArgs(ConvBiasForwardImpl* opr, _megdnn_tensor_in src,
                 _megdnn_tensor_in filter, _megdnn_tensor_in bias,
                 _megdnn_tensor_in z, _megdnn_tensor_out dst,
                 _megdnn_workspace workspace,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
    };

    //! whether this algorithm can handle the given problem at all
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;

    //! extra workspace needed by exec_preprocess(); 0 means this algorithm
    //! performs no filter preprocessing
    virtual size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const {
        MEGDNN_MARK_USED_VAR(args);
        return 0;
    }

    //! layouts of the preprocessed filter tensors; empty when the algorithm
    //! does not preprocess
    virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const {
        MEGDNN_MARK_USED_VAR(args);
        return {};
    }

    //! default preprocessing step is a no-op
    virtual void exec_preprocess(const ExecArgs& args) const {
        MEGDNN_MARK_USED_VAR(args);
    }

    //! available and workspace requirement does not exceed \p limit
    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

    //! availability check that additionally filters on algorithm attributes:
    //! all bits of \p positive_attr must be present and no bit of
    //! \p negative_attr may be present
    bool is_available_attribute(
            const SizeArgs& args,
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT,
            size_t limit = std::numeric_limits<size_t>::max()) {
        return contain_attribute_all(positive_attr) &&
               !contain_attribute_any(negative_attr) &&
               is_available_wk(args, limit);
    }

    //! assert that the provided workspace is large enough; returns *this so
    //! the call can be chained at the exec site
    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(
                req <= workspace.size,
                "conv bias fwd algo %s: required workspace %zu bytes, got %zu",
                name(), req, workspace.size);
        return *this;
    }

    //! whether this algo delegates to cuDNN (see AlgoPack::non_cudnn_algos)
    virtual bool is_cudnn() const { return false; }
};
  136. class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation final : public AlgoBase {
  137. public:
  138. AlgoCUDNNConvBiasActivation(cudnnConvolutionFwdAlgo_t cudnn_enum)
  139. : m_cudnn_enum(cudnn_enum) {
  140. megdnn_assert(CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
  141. CudnnAlgoPack::conv_fwd_algos().end());
  142. m_attr = CudnnAlgoPack::conv_fwd_algos().at(cudnn_enum);
  143. m_name = ConvBiasForward::algo_name<DefaultParam>(
  144. "CUDNN:ConvBiasActivation:" + m_attr.name, {});
  145. }
  146. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  147. void exec(const ExecArgs& args) const override;
  148. param::Convolution get_param_convolution(const SizeArgs& args) const;
  149. bool is_available(const SizeArgs&) const override;
  150. const char* name() const override { return m_name.c_str(); }
  151. AlgoAttribute attribute() const override {
  152. auto ret = static_cast<AlgoAttribute>(0);
  153. if (m_attr.is_reproducible) {
  154. ret |= AlgoAttribute::REPRODUCIBLE;
  155. }
  156. if (m_attr.accuracy_depend_on_batch) {
  157. ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
  158. }
  159. return ret;
  160. }
  161. cudnnConvolutionFwdAlgo_t cudnn_enum() { return m_cudnn_enum; }
  162. bool is_cudnn() const override { return true; }
  163. MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONVBIAS)
  164. std::string param() const override {
  165. std::string ret;
  166. serialize_write_pod(m_cudnn_enum, ret);
  167. return ret;
  168. }
  169. private:
  170. std::string m_name;
  171. cudnnConvolutionFwdAlgo_t m_cudnn_enum;
  172. CudnnAlgoPack::Attr m_attr;
  173. };
  174. class ConvBiasForwardImpl::AlgoChanwise final : public AlgoBase {
  175. public:
  176. bool is_available(const SizeArgs& args) const override;
  177. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  178. void exec(const ExecArgs& args) const override;
  179. const char* name() const override {
  180. if (m_name.empty()) {
  181. m_name =
  182. ConvBiasForward::algo_name<DirectParam>("CHANNEL_WISE", {});
  183. }
  184. return m_name.c_str();
  185. }
  186. AlgoAttribute attribute() const override {
  187. return AlgoAttribute::REPRODUCIBLE;
  188. }
  189. MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE)
  190. private:
  191. mutable std::string m_name;
  192. };
  193. class ConvBiasForwardImpl::AlgoChanwiseSmall final : public AlgoBase {
  194. public:
  195. bool is_available(const SizeArgs& args) const override;
  196. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  197. void exec(const ExecArgs& args) const override;
  198. const char* name() const override {
  199. if (m_name.empty()) {
  200. m_name = ConvBiasForward::algo_name<DirectParam>(
  201. "CHANNEL_WISE_SMALL", {});
  202. }
  203. return m_name.c_str();
  204. }
  205. MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL)
  206. AlgoAttribute attribute() const override {
  207. return AlgoAttribute::REPRODUCIBLE;
  208. }
  209. private:
  210. mutable std::string m_name;
  211. };
  212. class ConvBiasForwardImpl::AlgoChanwise8x8x32 final : public AlgoBase {
  213. public:
  214. bool is_available(const SizeArgs& args) const override;
  215. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  216. void exec(const ExecArgs& args) const override;
  217. const char* name() const override {
  218. if (m_name.empty()) {
  219. m_name = ConvBiasForward::algo_name<DirectParam>(
  220. "CHANNEL_WISE_8X8X32", {});
  221. }
  222. return m_name.c_str();
  223. }
  224. MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_INT8X8X32)
  225. AlgoAttribute attribute() const override {
  226. return AlgoAttribute::REPRODUCIBLE;
  227. }
  228. private:
  229. mutable std::string m_name;
  230. };
  231. class ConvBiasForwardImpl::AlgoCUDNNConv final : public AlgoBase {
  232. public:
  233. AlgoCUDNNConv(cudnnConvolutionFwdAlgo_t cudnn_enum)
  234. : m_cudnn_enum(cudnn_enum) {
  235. megdnn_assert(CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
  236. CudnnAlgoPack::conv_fwd_algos().end());
  237. m_attr = CudnnAlgoPack::conv_fwd_algos().at(cudnn_enum);
  238. m_name = ConvBiasForward::algo_name<DefaultParam>(
  239. "CUDNN:Convolution:" + m_attr.name, {});
  240. }
  241. bool is_available(const SizeArgs& args) const override;
  242. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  243. void exec(const ExecArgs& args) const override;
  244. AlgoAttribute attribute() const override {
  245. auto ret = static_cast<AlgoAttribute>(0);
  246. if (m_attr.is_reproducible) {
  247. ret |= AlgoAttribute::REPRODUCIBLE;
  248. }
  249. if (m_attr.accuracy_depend_on_batch) {
  250. ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
  251. }
  252. return ret;
  253. }
  254. const char* name() const override { return m_name.c_str(); }
  255. cudnnConvolutionFwdAlgo_t cudnn_enum() const { return m_cudnn_enum; }
  256. bool is_cudnn() const override { return true; }
  257. MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONV)
  258. std::string param() const override {
  259. std::string ret;
  260. serialize_write_pod(m_cudnn_enum, ret);
  261. return ret;
  262. }
  263. private:
  264. std::string m_name;
  265. cudnnConvolutionFwdAlgo_t m_cudnn_enum;
  266. CudnnAlgoPack::Attr m_attr;
  267. WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
  268. };
  269. //! compute small matmul in the kernel
  270. class ConvBiasForwardImpl::AlgoInplaceMatmul final : public AlgoBase {
  271. public:
  272. bool is_available(const SizeArgs& args) const override;
  273. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  274. void exec(const ExecArgs& args) const override;
  275. const char* name() const override {
  276. if (m_name.empty()) {
  277. m_name = ConvBiasForward::algo_name<ConvBias::MatmulParam>(
  278. "INPLACE_MATMUL", {});
  279. }
  280. return m_name.c_str();
  281. }
  282. MEGDNN_DECL_ALGO_TYPE(CUDA_INPLACE_MATMUL)
  283. AlgoAttribute attribute() const override {
  284. return AlgoAttribute::REPRODUCIBLE;
  285. }
  286. private:
  287. mutable std::string m_name;
  288. };
  289. //! im2col and matmul, with dilation
  290. class ConvBiasForwardImpl::AlgoMatmul final : public AlgoBase {
  291. template <typename T>
  292. static void exec_internal(const ExecArgs& args,
  293. const WorkspaceBundle& bundle);
  294. public:
  295. bool is_available(const SizeArgs& args) const override;
  296. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  297. void exec(const ExecArgs& args) const override;
  298. const char* name() const override {
  299. if (m_name.empty()) {
  300. m_name = ConvBiasForward::algo_name<ConvBias::MatmulParam>("MATMUL",
  301. {});
  302. }
  303. return m_name.c_str();
  304. }
  305. std::vector<SearchItem> get_subopr_list(
  306. const TensorLayoutArray& layouts,
  307. const OperatorBase* opr) const override;
  308. MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
  309. AlgoAttribute attribute() const override {
  310. return AlgoAttribute::REPRODUCIBLE |
  311. AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
  312. }
  313. private:
  314. WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
  315. mutable std::string m_name;
  316. };
  317. class ConvBiasForwardImpl::AlgoMatmul8x8x32 final : public AlgoBase {
  318. public:
  319. bool is_available(const SizeArgs& args) const override;
  320. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  321. void exec(const ExecArgs& args) const override;
  322. const char* name() const override {
  323. if (m_name.empty()) {
  324. m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
  325. "MATMUL8X8X32", {});
  326. }
  327. return m_name.c_str();
  328. }
  329. MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL_INT8X8X32)
  330. AlgoAttribute attribute() const override {
  331. return AlgoAttribute::REPRODUCIBLE;
  332. }
  333. private:
  334. bool need_src_unroll(const SizeArgs& args) const;
  335. bool need_filter_reshape(const SizeArgs& args) const;
  336. template <Param::Format>
  337. WorkspaceBundle get_bundle(const SizeArgs& args) const;
  338. template <Param::Format>
  339. void exec_internal(const ExecArgs& args) const;
  340. mutable std::string m_name;
  341. };
  342. class ConvBiasForwardImpl::AlgoBatchedMatmul final : public AlgoBase {
  343. public:
  344. bool is_available(const SizeArgs& args) const override;
  345. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  346. void exec(const ExecArgs& args) const override;
  347. const char* name() const override {
  348. if (m_name.empty()) {
  349. m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
  350. "BATCHED_MATMUL", {});
  351. }
  352. return m_name.c_str();
  353. }
  354. std::vector<SearchItem> get_subopr_list(
  355. const TensorLayoutArray& layouts,
  356. const OperatorBase* opr) const override;
  357. AlgoAttribute attribute() const override {
  358. return AlgoAttribute::REPRODUCIBLE |
  359. AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
  360. }
  361. MEGDNN_DECL_ALGO_TYPE(CUDA_BATCHED_MATMUL)
  362. private:
  363. WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
  364. mutable std::string m_name;
  365. };
//! implement group conv by another algo
class ConvBiasForwardImpl::AlgoGroupConvGeneral final : public AlgoBase {
public:
    //! \param impl the delegate algorithm used to compute each group
    AlgoGroupConvGeneral(AlgoBase* impl);
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    //! attributes are inherited from the wrapped implementation
    AlgoAttribute attribute() const override {
        auto ret = AlgoAttribute::DEFAULT;
#define cb(attr) \
    if (m_impl->contain_attribute_all(attr)) { \
        ret |= attr; \
    }
        MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb)
#undef cb
        // NOTE(review): if REPRODUCIBLE is already covered by
        // MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE, this explicit check is
        // redundant (harmless either way) -- confirm against the macro
        if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }
    //! adjust args/layouts for per-group execution -- see the .cpp for the
    //! exact transformation
    static void modify_size_args(SizeArgs& args, TensorLayout& src_pg,
                                 TensorLayout& dst_pg, TensorLayout& bias_pg);
    MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)
private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    AlgoBase* m_impl;    //!< delegate algorithm that runs each group
    std::string m_name;
};
  395. #if CUDA_VERSION >= 10000
//! quint4 x quint4 -> int32 convolution using WMMA (requires CUDA >= 10.0,
//! see the surrounding #if CUDA_VERSION guard)
class ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA final : public AlgoBase {
public:
    AlgoQUInt4x4x32WMMA() = default;
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return "QUINT4x4x32_WMMA"; }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
    //! selects between kernel variants -- see the .cpp for the criterion
    bool use_kernel_fhxfw(const SizeArgs& args) const;
    //! workspace needed by the convolution proper (excluding other stages)
    size_t get_workspace_in_bytes_do_conv(const SizeArgs& args) const;
    MEGDNN_DECL_ALGO_TYPE(CUDA_WMMA_UINT4X4X32)
};
  413. #endif
//! int8 implicit-gemm convolution on CHWN4 layout using dot-product int8
class ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm final
        : public AlgoBase {
public:
    AlgoInt8CHWN4DotProdImplicitGemm() = default;
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        return "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM";
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    //! dispatch to the kernel instantiation matching \p nonlinear_mode;
    //! the d_* pointers are presumably device memory -- confirm in the .cu
    template <typename BiasVisitor>
    static void dispatch_nonlinear_mode(
            const int8_t* d_src, const int8_t* d_filter,
            BiasVisitor bias_visitor, const int8_t* d_z, int8_t* d_dst,
            const convolution::ConvParam& param, float alpha, float beta,
            float gamma, float scale, cudaStream_t stream,
            param::ConvBias::NonlineMode nonlinear_mode);
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_CHWN4_DOTPROD_INT8)
};
  436. class ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm final
  437. : public AlgoBase {
  438. public:
  439. struct AlgoParam {
  440. int threadblock_m;
  441. int threadblock_n;
  442. int threadblock_k;
  443. int warp_m;
  444. int warp_n;
  445. int warp_k;
  446. int stage;
  447. std::string to_string() {
  448. /// default algorithm
  449. if (threadblock_m == 128 && threadblock_n == 128 &&
  450. threadblock_k == 32 && warp_m == 32 && warp_n == 64 &&
  451. warp_k == 32 && stage == 2) {
  452. return "";
  453. }
  454. return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m,
  455. threadblock_n, threadblock_k, warp_m, warp_n,
  456. warp_k, stage);
  457. }
  458. };
  459. AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param)
  460. : m_algo_param{algo_param},
  461. m_name{ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
  462. m_algo_param.to_string().c_str())} {}
  463. bool is_available(const SizeArgs& args) const override;
  464. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  465. void exec(const ExecArgs& args) const override;
  466. const char* name() const override { return m_name.c_str(); }
  467. AlgoAttribute attribute() const override {
  468. return AlgoAttribute::REPRODUCIBLE;
  469. }
  470. size_t get_preprocess_workspace_in_bytes(
  471. const SizeArgs& args) const override;
  472. SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
  473. const SizeArgs& args) const override;
  474. void exec_preprocess(const ExecArgs& args) const override;
  475. MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)
  476. std::string param() const override {
  477. std::string ret;
  478. serialize_write_pod(m_algo_param, ret);
  479. return ret;
  480. }
  481. private:
  482. WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
  483. const SizeArgs& args) const;
  484. AlgoParam m_algo_param;
  485. std::string m_name;
  486. };
//! fallback for quantized-int8 NCHW input: relayouts to an inner format and
//! delegates to an inner conv implementation
class ConvBiasForwardImpl::AlgoFallbackNCHWQS8 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        return "FALLBACK_CONV_NCHW_QS8";
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    // NOTE(review): this declares the SAME AlgoType as
    // AlgoInt8NCHW4DotProdImplicitGemm (L481 above). AlgoType is part of the
    // algorithm description used as the key of AlgoBase::Mapper, so two
    // distinct algos sharing one tag looks like a collision bug; a dedicated
    // enum entry (e.g. CUDA_FALLBACK_NCHW_QS8) should probably be added to
    // AlgoBase::AlgoType and used here. TODO confirm and fix together with
    // the enum -- cannot be changed in isolation.
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)
private:
    //! build the layouts consumed by the inner implementation -- see the
    //! .cpp for the exact inner format
    void make_inner_layout(const SizeArgs& args, TensorLayout& inner_src_layout,
                           TensorLayout& inner_weight_layout,
                           TensorLayout& inner_dst_layout,
                           TensorLayout& inner_bias_layout,
                           TensorLayout& inner_z_layout) const;
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};
  507. #if CUDA_VERSION >= 10000
//! int8 implicit-gemm convolution on CHWN4 layout using imma instructions
//! (requires CUDA >= 10.0, see the surrounding #if CUDA_VERSION guard)
class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm final
        : public AlgoBase {
public:
    //! supported imma tile shapes; values are serialized by param(), so the
    //! order must not change
    enum class MMATileSize : uint32_t {
        IMMA16x16x16,
        IMMA32x8x16,
        IMMA8x32x16
    };
    AlgoInt8CHWN4IMMAImplicitGemm(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_" +
                     to_string(m_mma_tile_size)} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    //! dispatch to the kernel instantiation matching \p nonlinear_mode and
    //! \p mma_tile_size; shared with the sibling IMMA algos below
    template <typename BiasVisitor>
    static void dispatch_nonlinear_mode(
            const int8_t* d_src, const int8_t* d_filter,
            BiasVisitor bias_visitor, int8_t* d_z, int8_t* d_dst,
            const convolution::ConvParam& param, float alpha, float beta,
            float gamma, float scale, cudaStream_t stream,
            param::ConvBias::NonlineMode nonlinear_mode,
            MMATileSize mma_tile_size);
    //! readable suffix for a tile size, used when building algorithm names
    static std::string to_string(MMATileSize mma_tile_size);
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_CHWN4_IMMA_INT8)
    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};
  546. class ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm final
  547. : public AlgoBase {
  548. public:
  549. using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;
  550. AlgoInt8NCHW4IMMAImplicitGemm(MMATileSize mma_tile_size)
  551. : m_mma_tile_size{mma_tile_size},
  552. m_name{"INT8_NCHW4_IMMA_IMPLICIT_GEMM_" +
  553. AlgoInt8CHWN4IMMAImplicitGemm::to_string(
  554. m_mma_tile_size)} {}
  555. bool is_available(const SizeArgs& args) const override;
  556. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  557. void exec(const ExecArgs& args) const override;
  558. const char* name() const override { return m_name.c_str(); }
  559. MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_IMMA_INT8)
  560. std::string param() const override {
  561. std::string ret;
  562. serialize_write_pod(m_mma_tile_size, ret);
  563. return ret;
  564. }
  565. AlgoAttribute attribute() const override {
  566. return AlgoAttribute::REPRODUCIBLE;
  567. }
  568. private:
  569. WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
  570. const SizeArgs& args) const;
  571. MMATileSize m_mma_tile_size;
  572. std::string m_name;
  573. };
  574. class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter final
  575. : public AlgoBase {
  576. public:
  577. using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;
  578. AlgoInt8CHWN4IMMAImplicitGemmReorderFilter(MMATileSize mma_tile_size)
  579. : m_mma_tile_size{mma_tile_size},
  580. m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_" +
  581. AlgoInt8CHWN4IMMAImplicitGemm::to_string(
  582. m_mma_tile_size)} {}
  583. bool is_available(const SizeArgs& args) const override;
  584. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  585. void exec(const ExecArgs& args) const override;
  586. const char* name() const override { return m_name.c_str(); }
  587. MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8)
  588. std::string param() const override {
  589. std::string ret;
  590. serialize_write_pod(m_mma_tile_size, ret);
  591. return ret;
  592. }
  593. AlgoAttribute attribute() const override {
  594. return AlgoAttribute::REPRODUCIBLE;
  595. }
  596. private:
  597. MMATileSize m_mma_tile_size;
  598. std::string m_name;
  599. };
  600. class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth final
  601. : public AlgoBase {
  602. public:
  603. using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;
  604. AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth(MMATileSize mma_tile_size)
  605. : m_mma_tile_size{mma_tile_size},
  606. m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_" +
  607. AlgoInt8CHWN4IMMAImplicitGemm::to_string(
  608. m_mma_tile_size)} {}
  609. bool is_available(const SizeArgs& args) const override;
  610. size_t get_workspace_in_bytes(const SizeArgs& args) const override;
  611. void exec(const ExecArgs& args) const override;
  612. const char* name() const override { return m_name.c_str(); }
  613. MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8)
  614. std::string param() const override {
  615. std::string ret;
  616. serialize_write_pod(m_mma_tile_size, ret);
  617. return ret;
  618. }
  619. AlgoAttribute attribute() const override {
  620. return AlgoAttribute::REPRODUCIBLE;
  621. }
  622. private:
  623. MMATileSize m_mma_tile_size;
  624. std::string m_name;
  625. };
  626. #endif
  627. #if CUDA_VERSION >= 10020
//! int8 implicit-gemm convolution on NCHW32 layout using imma instructions
//! (requires CUDA >= 10.2, see the surrounding #if CUDA_VERSION guard)
class ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm final
        : public AlgoBase {
public:
    //! tiling configuration of the underlying kernel; serialized by param()
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
    };
    AlgoInt8NCHW32IMMAImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param} {
        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                ssprintf("INT8_NCHW32_IMMA_IMPLICIT_GEMM_%s",
                         to_string(m_algo_param).c_str()),
                ConvBias::DirectParam{});
    }
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    //! readable suffix for a tiling configuration, used in the name
    static std::string to_string(AlgoParam algo_param);
    //! this algorithm supports filter preprocessing, hence the overrides
    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8)
    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
};
  671. #endif
//! bfloat16 conv-bias implemented on top of other (sub-operator) algorithms
class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    //! sub-operators required by this algo (resolved via the algo framework)
    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;
    const char* name() const override { return "CONVBIAS_BFLOAT16"; }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};
//! owns every conv-bias algorithm instance and the desc -> algo lookup table
class ConvBiasForwardImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_all_algos_map;  //!< desc -> algo lookup

public:
    AlgoPack();
    //! note: member declaration order below is also construction order
    std::vector<AlgoBase*> all_algos,
            //! non-cudnn algos, used for heuristic if cudnn is not supported
            non_cudnn_algos, bfloat16_algos;
    std::vector<AlgoCUDNNConvBiasActivation> cudnn_conv_bias_activations;
    std::vector<AlgoCUDNNConv> cudnn_convs;
    AlgoFallbackNCHWQS8 fallback_nchw_qs8;
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
    AlgoChanwise8x8x32 chanwise8x8x32;
    AlgoInplaceMatmul inplace_matmul;
    AlgoMatmul matmul;
    AlgoMatmul8x8x32 matmul8x8x32;
    AlgoBatchedMatmul batched_matmul;
    std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
    AlgoInt8CHWN4DotProdImplicitGemm int8_chwn4_dotprod;
#if CUDA_VERSION >= 10000
    AlgoQUInt4x4x32WMMA wmma_quint4x4x32;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemm> int8_chwn4_imma;
    std::vector<AlgoInt8NCHW4IMMAImplicitGemm> int8_nchw4_imma;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemmReorderFilter>
            int8_chwn4_imma_reorder_filter;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth>
            int8_chwn4_imma_unroll_width;
#endif
#if CUDA_VERSION >= 10020
    std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
#endif
    //! owning storage for the group-conv wrappers (presumably the values of
    //! algo2gconv below -- confirm in algo.cpp)
    std::vector<std::unique_ptr<AlgoGroupConvGeneral>> gconv_refhold;
    AlgoBFloat16 bfloat16;
    std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;

    //! look up the pre-built cudnn-backed algo wrapping \p algo
    AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo);
    AlgoBase* cudnn_conv_from_enum(cudnnConvolutionFwdAlgo_t algo);

    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }

private:
#if CUDA_VERSION >= 10000
    void fill_imma_algos();
#endif
    void fill_cudnn_algos();
    void fill_dp4a_algos();
};
  733. } // namespace cuda
  734. } // namespace megdnn
  735. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台