
opr_impl.cpp 12 kB

/**
 * \file dnn/src/cuda/conv_bias/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "megdnn/dtype.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/conv_bias/helper.h"
#include "src/cuda/conv_bias/opr_impl.h"
#include "src/cuda/handle.h"
#include "src/cuda/utils.h"
#include "src/common/algo_chooser.h"
#include "src/cuda/cudnn_with_check.h"

namespace megdnn {
namespace cuda {

void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
                               _megdnn_tensor_in bias, _megdnn_tensor_in z,
                               _megdnn_tensor_out dst,
                               const PreprocessedFilter* preprocessed_filter,
                               _megdnn_workspace workspace) {
    check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout,
               workspace.size, preprocessed_filter);
    AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace,
                            preprocessed_filter);
    auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout,
                              z.layout, dst.layout);
    algo->check_workspace(args, workspace).exec(args);
}
std::vector<ConvBiasForward::Algorithm*>
ConvBiasForwardImpl::get_all_algorithms(const TensorLayout& src,
                                        const TensorLayout& filter,
                                        const TensorLayout& bias,
                                        const TensorLayout& z,
                                        const TensorLayout& dst) {
    return megdnn::get_all_algorithms<ConvBiasForwardImpl>(
            {this, src, filter, bias, z, dst});
}
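//! Heuristic order below: channel-wise kernels first (to dodge slow cuDNN
//! group conv), then cuDNN fused conv-bias-activation, then plain cuDNN
//! conv, then grouped-conv rewrites, the 1x1 batched-matmul path, the
//! NCHW qs8 fallback, and finally generic attribute matching.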
ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    using namespace conv_bias;
    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
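    //! If the bias and dst dtypes differ, re-deduce the natural conv output
    //! dtype so the bias-free cuDNN conv path can be queried with a layout
    //! it actually supports.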
    auto dst_layout = *args.dst_layout;
    if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
        dst_layout.dtype = DType();
        args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype,
                                            args.filter_layout->dtype,
                                            dst_layout.dtype);
    }
    auto conv_args = args;
    auto cudnn_conv_bias_act_from_enum_wrapper =
            [](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* {
        return sm_algo_pack.cudnn_conv_bias_act_from_enum(algo);
    };
    auto cudnn_conv_from_enum_wrapper =
            [](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* {
        return sm_algo_pack.cudnn_conv_from_enum(algo);
    };
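    //! Ask cuDNN for its ranked forward algorithms and return the first one
    //! whose wrapper satisfies the requested attributes within the workspace
    //! limit; \p cb maps a cudnnConvolutionFwdAlgo_t onto our AlgoBase.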
    auto get_cudnn_algo =
            [this, &conv_args, &args, workspace_limit_in_bytes, positive_attr,
             negative_attr](
                    const thin_function<AlgoBase*(cudnnConvolutionFwdAlgo_t)>&
                            cb) -> AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        CUDNNForwardDescs desc;
        conv_args.init_conv_desc(desc);
#if CUDNN_MAJOR >= 7
        int max_count = 0;
        cudnn_check(cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle,
                                                                &max_count));
        SmallVector<cudnnConvolutionFwdAlgoPerf_t> algo_perf(max_count);
        int ret_count = 0;
        cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7(
                cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc,
                desc.conv_desc.conv_desc, desc.dst_desc.desc, max_count,
                &ret_count, algo_perf.data()));
        for (int i = 0; i < ret_count; ++i) {
            auto conv_bias_algo = cb(algo_perf[i].algo);
            if (conv_bias_algo->is_available_attribute(
                        args, positive_attr, negative_attr,
                        workspace_limit_in_bytes))
                return conv_bias_algo;
        }
#else
        cudnnConvolutionFwdAlgo_t algo;
        cudnn_check(cudnnGetConvolutionForwardAlgorithm(
                cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc,
                desc.conv_desc.conv_desc, desc.dst_desc.desc,
                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
                workspace_limit_in_bytes, &algo));
        auto conv_bias_algo = cb(algo);
        if (conv_bias_algo->is_available_attribute(args, positive_attr,
                                                   negative_attr,
                                                   workspace_limit_in_bytes))
            return conv_bias_algo;
#endif
        return nullptr;
    };
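    //! The 1x1 fast path is served by the batched matmul algorithm whenever
    //! its attributes and workspace requirement fit.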
    auto get_1x1_algo = [workspace_limit_in_bytes, positive_attr,
                         negative_attr](const AlgoBase::SizeArgs& size_arg)
            -> ConvBiasForwardImpl::AlgoBase* {
        if (sm_algo_pack.batched_matmul.is_available_attribute(
                    size_arg, positive_attr, negative_attr,
                    workspace_limit_in_bytes)) {
            return &sm_algo_pack.batched_matmul;
        }
        return nullptr;
    };
    const bool is_chanwise =
            (args.filter_meta.format == Param::Format::NCHW &&
             args.filter_meta.group == src[1]) ||
            (args.filter_meta.format == Param::Format::NCHW4 &&
             args.filter_meta.group == src[1] * 4) ||
            (args.filter_meta.format == Param::Format::NCHW32 &&
             args.filter_meta.group == src[1] * 32);
    // prefer the special chanwise impl, since cuDNN group convolution
    // before v7.5.0 is still slower than our implementation in many
    // channel-wise cases
    const bool slow_cudnn_chanwise_impl =
            CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5);
    //! cuDNN chooses CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM by default for
    //! large images
    const int hw_size = src[2] * src[3];
    //! choose our dnn impl when stride != 1; the threshold may need
    //! recalibration for other cuDNN versions
    const bool prefer_dnn_chanwise =
            slow_cudnn_chanwise_impl || args.filter_meta.stride[0] != 1 ||
            args.filter_meta.stride[1] != 1 || hw_size < 512;
    //! avoid bad cases in cudnn: check the dnn chanwise impls first
    if (is_chanwise) {
        if (prefer_dnn_chanwise) {
            if (sm_algo_pack.chanwise.is_available_attribute(
                        args, positive_attr, negative_attr,
                        workspace_limit_in_bytes))
                return &sm_algo_pack.chanwise;
            if (sm_algo_pack.chanwise8x8x32.is_available_attribute(
                        args, positive_attr, negative_attr,
                        workspace_limit_in_bytes))
                return &sm_algo_pack.chanwise8x8x32;
        } else {
            conv_args.dst_layout = &dst_layout;
            if (is_cudnn_supported(conv_args)) {
                if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) {
                    return algo;
                }
            }
        }
    }
    //! Prefer CUDNN CONVBIAS.
    bool cudnn_conv_bias_act_supported = false;
    for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) {
        if (algo.is_available_attribute(args, positive_attr, negative_attr,
                                        workspace_limit_in_bytes)) {
            cudnn_conv_bias_act_supported = true;
            break;
        }
    }
    if (cudnn_conv_bias_act_supported) {
        if (auto algo = get_cudnn_algo(cudnn_conv_bias_act_from_enum_wrapper))
            return algo;
    }
    // modify conv_args dst_layout
    conv_args.dst_layout = &dst_layout;
    if (is_cudnn_supported(conv_args)) {
        if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper))
            return algo;
    }
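    //! For grouped convolution, rewrite the size args as an equivalent
    //! single-group problem and translate any algorithm found back through
    //! the algo2gconv map.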
    if (args.filter_meta.group > 1) {
        auto orig_args = conv_args;
        TensorLayout src, dst, bias;
        AlgoGroupConvGeneral::modify_size_args(conv_args, src, dst, bias);
        if (auto algo = get_1x1_algo(conv_args)) {
            return sm_algo_pack.algo2gconv.at(algo);
        }
        if (is_cudnn_supported(conv_args)) {
            if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) {
                return sm_algo_pack.algo2gconv.at(algo);
            }
        }
        conv_args = orig_args;
    }
    if (auto algo = get_1x1_algo(args)) {
        return algo;
    }
    if (sm_algo_pack.fallback_nchw_qs8.is_available_attribute(
                args, positive_attr, negative_attr,
                workspace_limit_in_bytes)) {
        return &sm_algo_pack.fallback_nchw_qs8;
    }
    if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
        return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
                sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
                "cuda convbias fwd", positive_attr, negative_attr);
    } else {
        return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
                sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
                "cuda convbias fwd", positive_attr, negative_attr);
    }
}
const char* ConvBiasForwardImpl::get_algorithm_set_name() const {
    return "CONV_BIAS_CUDA";
}

size_t ConvBiasForwardImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst,
        const PreprocessedFilter* preprocessed_filter) {
    AlgoBase::SizeArgs args{
            this, src, filter, bias, z, dst, preprocessed_filter};
    return get_algorithm(this, src, filter, bias, z, dst)
            ->get_workspace_in_bytes(args);
}

size_t ConvBiasForwardImpl::get_preprocess_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst) {
    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
    return get_algorithm(this, src, filter, bias, z, dst)
            ->get_preprocess_workspace_in_bytes(args);
}

SmallVector<TensorLayout>
ConvBiasForwardImpl::deduce_preprocessed_filter_layout(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& bias, const TensorLayout& z,
        const TensorLayout& dst) {
    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
    return get_algorithm(this, src, filter, bias, z, dst)
            ->deduce_preprocessed_filter_layout(args);
}
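//! Preprocessing only rewrites the filter, so src/z/dst are passed as
//! layout-only TensorNDs with null data pointers.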
void ConvBiasForwardImpl::exec_preprocess(
        const TensorLayout& src_layout, _megdnn_tensor_in filter,
        _megdnn_tensor_in bias, const TensorLayout& z_layout,
        const TensorLayout& dst_layout,
        PreprocessedFilter* preprocessed_filter,
        _megdnn_workspace workspace) {
    TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout},
            z{nullptr, z_layout};
    AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace,
                            preprocessed_filter);
    auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout,
                              z.layout, dst.layout);
    return algo->exec_preprocess(args);
}

}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen

The MegEngine package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine actually has a GPU and that its driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
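As a quick sanity check after installation, here is a minimal Python sketch. It assumes the standard pip install command from the MegEngine docs and that your release exports megengine.is_cuda_available(); verify both against the documentation for your version.

    # Install: one package covers both CPU and GPU
    #   pip3 install megengine -f https://megengine.org.cn/whl/mge.html
    import megengine

    # True when a CUDA device and a working driver are visible to MegEngine
    # (assumed API name; check your release's docs)
    print(megengine.is_cuda_available())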