You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

opr_impl.cpp 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
  1. /**
  2. * \file dnn/src/cuda/convolution/opr_impl.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "src/cuda/convolution/opr_impl.h"
  13. #include "megdnn/dtype.h"
  14. #include "src/common/algo_chooser.h"
  15. #include "src/cuda/convolution/helper.h"
  16. #include "src/cuda/convolution/forward/algos.h"
  17. #include "src/cuda/convolution/backward_data/algo.h"
  18. #include "src/cuda/convolution/backward_filter/algo.h"
  19. #include "src/cuda/conv_bias/opr_impl.h"
  20. #include "src/cuda/utils.h"
using namespace megdnn;
using namespace cuda;
using namespace convolution;

// Stringize the cuDNN version macros into a "MAJOR.MINOR.PATCH" literal.
// The two-level expansion (TO_STRING -> TO_STRING2) is required so that the
// macro arguments are expanded *before* the # stringification operator runs.
#define TO_STRING2(v) #v
#define TO_STRING(v) TO_STRING2(v)
#define CUDNN_VERSION_STR \
    TO_STRING(CUDNN_MAJOR) \
    "." TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL)
  29. /* ============== ConvolutionForwardImpl ============== */
  30. ConvolutionForwardImpl::Algorithm*
  31. ConvolutionForwardImpl::get_algorithm_heuristic(
  32. const TensorLayout& src, const TensorLayout& filter,
  33. const TensorLayout& dst, size_t workspace_limit_in_bytes,
  34. const AlgoAttribute& positive_attr,
  35. const AlgoAttribute& negative_attr) {
  36. AlgoBase::SizeArgs args{this, src, filter, dst};
  37. MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
  38. MEGDNN_MARK_USED_VAR(positive_attr);
  39. MEGDNN_MARK_USED_VAR(negative_attr);
  40. return &sm_algo_pack.algo_default;
  41. }
  42. std::vector<ConvolutionForwardImpl::Algorithm*>
  43. ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src,
  44. const TensorLayout& filter,
  45. const TensorLayout& dst) {
  46. AlgoBase::SizeArgs args{this, src, filter, dst};
  47. return megdnn::get_all_algorithms<ConvolutionForwardImpl>(args);
  48. }
  49. std::vector<ConvolutionForwardImpl::Algorithm*>
  50. ConvolutionForwardImpl::get_all_algorithms_safe(const TensorLayout& src,
  51. const TensorLayout& filter,
  52. const TensorLayout& dst) {
  53. AlgoBase::SizeArgs args{this, src, filter, dst};
  54. return megdnn::get_all_algorithms_safe<ConvolutionForwardImpl>(args);
  55. }
  56. size_t ConvolutionForwardImpl::get_workspace_in_bytes(
  57. const TensorLayout& src, const TensorLayout& filter,
  58. const TensorLayout& dst,
  59. const PreprocessedFilter* preprocessed_filter) {
  60. MEGDNN_MARK_USED_VAR(preprocessed_filter);
  61. return get_dnn_workspace(this, src, filter, dst);
  62. }
  63. void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
  64. _megdnn_tensor_in filter,
  65. _megdnn_tensor_out dst,
  66. const PreprocessedFilter* preprocessed_filter,
  67. _megdnn_workspace workspace) {
  68. check_exec(src.layout, filter.layout, dst.layout, workspace.size,
  69. preprocessed_filter);
  70. AlgoBase::ExecArgs args(this, src, filter, dst, workspace);
  71. auto&& algo = get_algorithm(this, src.layout, filter.layout, dst.layout);
  72. algo->exec(args);
  73. }
  74. const char* ConvolutionForwardImpl::get_algorithm_set_name() const {
  75. return "CUDA CONVOLUTION_FORWARD";
  76. }
  77. /* ============== ConvolutionBackwardDataImpl ============== */
  78. void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter,
  79. _megdnn_tensor_in diff,
  80. _megdnn_tensor_out grad,
  81. _megdnn_workspace workspace) {
  82. check_exec(filter.layout, diff.layout, grad.layout, workspace.size);
  83. AlgoBase::ExecArgs args(this, filter, diff, grad, workspace);
  84. auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout);
  85. algo->exec(args);
  86. }
  87. std::vector<ConvolutionBackwardDataImpl::Algorithm*>
  88. ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout& filter,
  89. const TensorLayout& diff,
  90. const TensorLayout& grad) {
  91. return megdnn::get_all_algorithms<ConvolutionBackwardDataImpl>(
  92. {this, filter, diff, grad});
  93. }
  94. std::vector<ConvolutionBackwardDataImpl::Algorithm*>
  95. ConvolutionBackwardDataImpl::get_all_algorithms_safe(const TensorLayout& filter,
  96. const TensorLayout& diff,
  97. const TensorLayout& grad) {
  98. return megdnn::get_all_algorithms_safe<ConvolutionBackwardDataImpl>(
  99. {this, filter, diff, grad});
  100. }
//! Heuristically pick a backward-data algorithm without profiling.
//!
//! Selection order, each step gated by the workspace limit and the
//! positive/negative attribute filters:
//!   1. specialized channel-wise kernel for grouped convolutions;
//!   2. int8 algorithm list when the filter dtype is QuantizedS8;
//!   3. a cuDNN-suggested algorithm (only if the args are expressible in
//!      cuDNN at all — see is_cudnn_supported);
//!   4. generic group-convolution wrapper for grouped convolutions;
//!   5. otherwise the non-cuDNN list, or the bfloat16 list for BFloat16
//!      filters.
ConvolutionBackwardDataImpl::Algorithm*
ConvolutionBackwardDataImpl::get_algorithm_heuristic(
        const TensorLayout& filter, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    AlgoBase::SizeArgs args(this, filter, diff, grad);
    if (args.filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        // prefer special chanwise impl
        return &sm_algo_pack.chanwise;
    }
    if (args.filter_layout->dtype.enumv() ==
        DTypeTrait<dtype::QuantizedS8>::enumv) {
        return megdnn::get_algo_match_attribute<ConvolutionBackwardDataImpl>(
                sm_algo_pack.int8_algos, args, workspace_limit_in_bytes,
                "cuda conv bwd_data", positive_attr, negative_attr);
    }
    // Query cuDNN for a candidate; returns nullptr when no candidate
    // satisfies the workspace/attribute constraints.
    auto get_cudnn_algo =
            [this, &args, workspace_limit_in_bytes, positive_attr,
             negative_attr]() -> ConvolutionBackwardDataImpl::AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        CUDNNBwdDataDescs desc;
        args.init_desc(desc);
#if CUDNN_MAJOR >= 7
        MEGDNN_MARK_USED_VAR(negative_attr);
        auto& cudnn = args.handle->cudnn();
        int max_count = 0;
        cudnn_check(cudnn.GetConvolutionBackwardDataAlgorithmMaxCount(
                cudnn_handle, &max_count));
        SmallVector<cudnnConvolutionBwdDataAlgoPerf_t> algo_perf(max_count);
        int ret_count = 0;
        cudnn_check(cudnn.GetConvolutionBackwardDataAlgorithm_v7(
                cudnn_handle, desc.filter_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc, max_count, &ret_count,
                algo_perf.data()));
        // Iterate the returned candidates (cuDNN documents them as sorted
        // by expected performance) and take the first acceptable one.
        for (int i = 0; i < ret_count; ++i) {
            if (algo_perf[i].memory > workspace_limit_in_bytes)
                continue;
            // REPRODUCIBLE requires a deterministic cuDNN kernel.
            if ((positive_attr & AlgoAttribute::REPRODUCIBLE) &&
                (algo_perf[i].determinism != CUDNN_DETERMINISTIC)) {
                continue;
            }
            AlgoBase* conv_bd_data_algo = reinterpret_cast<AlgoBase*>(
                    sm_algo_pack.cudnn_from_enum(algo_perf[i].algo));
            if (conv_bd_data_algo->is_available_attribute(
                        args, positive_attr, negative_attr,
                        workspace_limit_in_bytes)) {
                return conv_bd_data_algo;
            }
        }
        return nullptr;
#else
        // Pre-cuDNN-7 API: a single suggestion constrained by the
        // workspace limit, then re-checked against the attribute filters.
        cudnnConvolutionBwdDataAlgo_t algo;
        cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm(
                cudnn_handle, desc.filter_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc,
                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                workspace_limit_in_bytes, &algo));
        auto&& cast_algo =
                reinterpret_cast<AlgoBase*>(sm_algo_pack.cudnn_from_enum(algo));
        return reinterpret_cast<AlgoBase*>(
                megdnn::get_algo_match_attribute<ConvolutionBackwardDataImpl>(
                        cast_algo, positive_attr, negative_attr));
#endif
    };
    if (is_cudnn_supported(args.as_fwd_args())) {
        if (auto algo = get_cudnn_algo())
            return algo;
    }
    if (args.filter_meta.group > 1 &&
        sm_algo_pack.group.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        return &sm_algo_pack.group;
    }
    // Fall back to the hand-written algorithms; BFloat16 has its own list.
    if (args.filter_layout->dtype.enumv() !=
        DTypeTrait<dtype::BFloat16>::enumv) {
        return megdnn::get_algo_match_attribute<ConvolutionBackwardDataImpl>(
                sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
                "cuda conv bwd_data", positive_attr, negative_attr);
    } else {
        return megdnn::get_algo_match_attribute<ConvolutionBackwardDataImpl>(
                sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
                "cuda conv bwd_data", positive_attr, negative_attr);
    }
}
  188. size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(
  189. const TensorLayout& filter, const TensorLayout& diff,
  190. const TensorLayout& grad) {
  191. return get_dnn_workspace(this, filter, diff, grad);
  192. }
  193. const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const {
  194. return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
  195. }
  196. /* ============== ConvolutionBackwardFilterImpl ============== */
  197. void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src,
  198. _megdnn_tensor_in diff,
  199. _megdnn_tensor_out grad,
  200. _megdnn_workspace workspace) {
  201. check_exec(src.layout, diff.layout, grad.layout, workspace.size);
  202. AlgoBase::ExecArgs args(this, src, diff, grad, workspace);
  203. auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout);
  204. algo->exec(args);
  205. }
  206. std::vector<ConvolutionBackwardFilterImpl::Algorithm*>
  207. ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout& src,
  208. const TensorLayout& diff,
  209. const TensorLayout& grad) {
  210. return megdnn::get_all_algorithms<ConvolutionBackwardFilterImpl>(
  211. {this, src, diff, grad});
  212. }
  213. std::vector<ConvolutionBackwardFilterImpl::Algorithm*>
  214. ConvolutionBackwardFilterImpl::get_all_algorithms_safe(const TensorLayout& src,
  215. const TensorLayout& diff,
  216. const TensorLayout& grad) {
  217. return megdnn::get_all_algorithms_safe<ConvolutionBackwardFilterImpl>(
  218. {this, src, diff, grad});
  219. }
//! Heuristically pick a backward-filter algorithm without profiling.
//!
//! Selection order, each step gated by the workspace limit and the
//! positive/negative attribute filters:
//!   1. specialized channel-wise kernel for grouped convolutions;
//!   2. a cuDNN-suggested algorithm (only if the args are expressible in
//!      cuDNN at all — see is_cudnn_supported);
//!   3. generic group-convolution wrapper for grouped convolutions;
//!   4. otherwise the non-cuDNN list, or the bfloat16 list for BFloat16
//!      sources.
ConvolutionBackwardFilterImpl::Algorithm*
ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    AlgoBase::SizeArgs args(this, src, diff, grad);
    if (args.grad_filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        // prefer special chanwise impl
        return &sm_algo_pack.chanwise;
    }
    // Query cuDNN for a candidate; returns nullptr when no candidate
    // satisfies the workspace/attribute constraints.
    auto get_cudnn_algo =
            [this, &args, workspace_limit_in_bytes, positive_attr,
             negative_attr]() -> ConvolutionBackwardFilterImpl::AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        CUDNNBwdFilterDescs desc;
        args.init_desc(desc);
        // disable, segfault in megbrain, need further investigate.
#if 0
        auto is_heuristic_success =
                convolution::PerformanceModelBackwardFilter::
                        get_algo_backward_filter_success(
                                args, desc, workspace_limit_in_bytes, &algo);
        if (is_heuristic_success) {
            return sm_algo_pack.cudnn_from_enum(algo);
        }
#endif
#if CUDNN_MAJOR >= 7
        MEGDNN_MARK_USED_VAR(negative_attr);
        auto& cudnn = args.handle->cudnn();
        int max_count = 0;
        cudnn_check(cudnn.GetConvolutionBackwardFilterAlgorithmMaxCount(
                cudnn_handle, &max_count));
        SmallVector<cudnnConvolutionBwdFilterAlgoPerf_t> algo_perf(max_count);
        int ret_count = 0;
        cudnn_check(cudnn.GetConvolutionBackwardFilterAlgorithm_v7(
                cudnn_handle, desc.src_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc, max_count, &ret_count,
                algo_perf.data()));
        // Iterate the returned candidates (cuDNN documents them as sorted
        // by expected performance) and take the first acceptable one.
        for (int i = 0; i < ret_count; ++i) {
            if (algo_perf[i].memory > workspace_limit_in_bytes)
                continue;
            // REPRODUCIBLE requires a deterministic cuDNN kernel.
            if ((positive_attr & AlgoAttribute::REPRODUCIBLE) &&
                (algo_perf[i].determinism != CUDNN_DETERMINISTIC)) {
                continue;
            }
            AlgoBase* conv_bd_filter_algo = reinterpret_cast<AlgoBase*>(
                    sm_algo_pack.cudnn_from_enum(algo_perf[i].algo));
            if (conv_bd_filter_algo->is_available_attribute(
                        args, positive_attr, negative_attr,
                        workspace_limit_in_bytes)) {
                return conv_bd_filter_algo;
            }
        }
        return nullptr;
#else
        // Pre-cuDNN-7 API: a single suggestion constrained by the
        // workspace limit, then re-checked against the attribute filters.
        cudnnConvolutionBwdFilterAlgo_t algo;
        cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm(
                cudnn_handle, desc.src_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc,
                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                workspace_limit_in_bytes, &algo));
        auto&& cast_algo =
                reinterpret_cast<AlgoBase*>(sm_algo_pack.cudnn_from_enum(algo));
        return reinterpret_cast<AlgoBase*>(
                megdnn::get_algo_match_attribute<ConvolutionBackwardFilterImpl>(
                        cast_algo, positive_attr, negative_attr));
#endif
    };
    if (is_cudnn_supported(args.as_fwd_args())) {
        if (auto algo = get_cudnn_algo())
            return algo;
    }
    if (args.grad_filter_meta.group > 1 &&
        sm_algo_pack.group.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        return &sm_algo_pack.group;
    }
    // Fall back to the hand-written algorithms; BFloat16 has its own list.
    if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
        return megdnn::get_algo_match_attribute<ConvolutionBackwardFilterImpl>(
                sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
                "cuda conv bwd_filter", positive_attr, negative_attr);
    } else {
        return megdnn::get_algo_match_attribute<ConvolutionBackwardFilterImpl>(
                sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
                "cuda conv bwd_filter", positive_attr, negative_attr);
    }
}
  310. size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes(
  311. const TensorLayout& src, const TensorLayout& diff,
  312. const TensorLayout& grad) {
  313. return get_dnn_workspace(this, src, diff, grad);
  314. }
  315. const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const {
  316. return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
  317. }
  318. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台