You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

opr_impl.cpp 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. #include "src/cuda/conv_bias/opr_impl.h"
  2. #include "megdnn/dtype.h"
  3. #include "src/cuda/conv_bias/algo.h"
  4. #include "src/cuda/conv_bias/helper.h"
  5. #include "src/cuda/handle.h"
  6. #include "src/cuda/utils.h"
  7. #include "src/common/algo_chooser.h"
  8. #include "src/common/conv_bias.h"
  9. #include "src/cuda/cudnn_with_check.h"
  10. namespace megdnn {
  11. namespace cuda {
  12. void ConvBiasForwardImpl::exec(
  13. _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias,
  14. _megdnn_tensor_in z, _megdnn_tensor_out dst,
  15. const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) {
  16. check_exec_allow_noncontiguous(
  17. src.layout, filter.layout, bias.layout, z.layout, dst.layout,
  18. workspace.size, preprocessed_filter);
  19. AlgoBase::ExecArgs args(
  20. this, src, filter, bias, z, dst, workspace, preprocessed_filter);
  21. auto algo = get_algorithm(
  22. this, src.layout, filter.layout, bias.layout, z.layout, dst.layout);
  23. algo->exec(args);
  24. };
  25. std::vector<ConvBiasForward::Algorithm*> ConvBiasForwardImpl::get_all_algorithms(
  26. const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
  27. const TensorLayout& z, const TensorLayout& dst) {
  28. return megdnn::get_all_algorithms<ConvBiasForwardImpl>(
  29. {this, src, filter, bias, z, dst});
  30. }
  31. std::vector<ConvBiasForward::Algorithm*> ConvBiasForwardImpl::get_all_algorithms_safe(
  32. const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
  33. const TensorLayout& z, const TensorLayout& dst) {
  34. return megdnn::get_all_algorithms_safe<ConvBiasForwardImpl>(
  35. {this, src, filter, bias, z, dst});
  36. }
/*!
 * \brief heuristically pick an algorithm for the given layouts.
 *
 * Candidates are probed in a fixed priority order; the first one whose
 * is_available_attribute() accepts (args, positive_attr, negative_attr,
 * workspace_limit_in_bytes) wins:
 *   1. cuDNN v8 frontend algos (when built against cuDNN >= 8.0.4)
 *   2. special channel-wise handling (implicit-bmm / large-filter /
 *      in-house chanwise / cuDNN, chosen by sub-heuristics below)
 *   3. cuDNN fused conv-bias-activation
 *   4. plain cuDNN convolution (with dst dtype possibly re-deduced)
 *   5. 1x1 via batched matmul, then group conv, then qs8/int1 fallbacks
 *   6. finally the generic non-cudnn (or bfloat16) algo lists
 *
 * \param workspace_limit_in_bytes upper bound on workspace an algo may use
 * \param positive_attr attributes the chosen algo must have
 * \param negative_attr attributes the chosen algo must not have
 */
ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
        const TensorLayout& z, const TensorLayout& dst, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr) {
    using namespace conv_bias;
    AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
#if CUDNN_VERSION >= 8004
    // prefer the cuDNN v8 (backend/frontend API) algos outright when available
    if (sm_algo_pack.cudnn_conv_v8.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        return &sm_algo_pack.cudnn_conv_v8;
    }
    if (sm_algo_pack.cudnn_conv_bias_activation_v8.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        return &sm_algo_pack.cudnn_conv_bias_activation_v8;
    }
#endif
    // if bias dtype differs from dst dtype, re-deduce the dst dtype of the
    // pure-conv part so the cuDNN conv probes below see a consistent layout
    auto dst_layout = *args.dst_layout;
    if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
        dst_layout.dtype = DType();
        args.opr->check_or_deduce_dtype_fwd(
                args.src_layout->dtype, args.filter_layout->dtype, dst_layout.dtype);
    }
    // conv_args: a copy whose dst_layout may be redirected to the re-deduced
    // layout above before cuDNN conv probing
    auto conv_args = args;
    // map a cudnn enum to our fused conv-bias-activation algo wrapper
    auto cudnn_conv_bias_act_from_enum_wrapper =
            [](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* {
        return sm_algo_pack.cudnn_conv_bias_act_from_enum(algo);
    };
    // map a cudnn enum to our plain-conv algo wrapper
    auto cudnn_conv_from_enum_wrapper =
            [](cudnnConvolutionFwdAlgo_t algo) -> AlgoBase* {
        return sm_algo_pack.cudnn_conv_from_enum(algo);
    };
    // ask cuDNN for its preferred forward algorithms and return the first one
    // (translated through `cb`) that satisfies our attribute/workspace limits;
    // nullptr if none qualifies
    auto get_cudnn_algo =
            [this, &conv_args, &args, workspace_limit_in_bytes, positive_attr,
             negative_attr](
                    const thin_function<AlgoBase*(cudnnConvolutionFwdAlgo_t)>& cb)
            -> AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        CUDNNForwardDescs desc;
        conv_args.init_conv_desc(desc);
#if CUDNN_MAJOR >= 7
        // cuDNN >= 7: enumerate all candidates ranked by perf and scan them
        int max_count = 0;
        cudnn_check(
                cudnnGetConvolutionForwardAlgorithmMaxCount(cudnn_handle, &max_count));
        SmallVector<cudnnConvolutionFwdAlgoPerf_t> algo_perf(max_count);
        int ret_count = 0;
        cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7(
                cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc,
                desc.conv_desc.conv_desc, desc.dst_desc.desc, max_count, &ret_count,
                algo_perf.data()));
        for (int i = 0; i < ret_count; ++i) {
            auto conv_bias_algo = cb(algo_perf[i].algo);
            if (conv_bias_algo->is_available_attribute(
                        args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
                return conv_bias_algo;
            }
        }
#else
        // cuDNN < 7: single recommendation constrained by the workspace limit
        cudnnConvolutionFwdAlgo_t algo;
        cudnn_check(cudnnGetConvolutionForwardAlgorithm(
                cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc,
                desc.conv_desc.conv_desc, desc.dst_desc.desc,
                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_limit_in_bytes,
                &algo));
        auto conv_bias_algo = cb(algo);
        if (conv_bias_algo->is_available_attribute(
                    args, positive_attr, negative_attr, workspace_limit_in_bytes))
            return conv_bias_algo;
#endif
        return nullptr;
    };
    // 1x1 convolutions can be served by a batched matmul
    auto get_1x1_algo = [workspace_limit_in_bytes, positive_attr,
                         negative_attr](const AlgoBase::SizeArgs& size_arg)
            -> ConvBiasForwardImpl::AlgoBase* {
        if (sm_algo_pack.batched_matmul.is_available_attribute(
                    size_arg, positive_attr, negative_attr, workspace_limit_in_bytes)) {
            return &sm_algo_pack.batched_matmul;
        }
        return nullptr;
    };
    // channel-wise conv: group count equals the (unpacked) input channel
    // count; NCHW4/NCHW32 pack 4/32 channels per element, hence the factors
    const bool is_chanwise = (args.filter_meta.format == Param::Format::NCHW &&
                              args.filter_meta.group == src[1]) ||
                             (args.filter_meta.format == Param::Format::NCHW4 &&
                              args.filter_meta.group == src[1] * 4) ||
                             (args.filter_meta.format == Param::Format::NCHW32 &&
                              args.filter_meta.group == src[1] * 32);
    // prefer the special chanwise impl, since the group conv of cudnn
    // versions lower than v7.5.0 is still slower than our own
    // implementation in many channel-wise cases
    const bool slow_cudnn_chanwise_impl =
            CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5);
    //! choose CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM default for large image
    const int hw_size = src[2] * src[3];
    //! choose dnn when stride != 1, may need calibrate for different cudnn
    //! version
    const bool prefer_dnn_chanwise = slow_cudnn_chanwise_impl ||
                                     args.filter_meta.stride[0] != 1 ||
                                     args.filter_meta.stride[1] != 1 || hw_size < 512;
    //! choose for large kernel cases: kernel covers at least half the image
    size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
    size_t hi = src[2], wi = src[3];
    const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw;
    //! filter size > 9, choose large kernel cases
    const bool prefer_direct_lk = fh > 9 && fw > 9;
    //! avoid bad case in cudnn, check dnn chanwise impl first
    if (is_chanwise) {
        if (prefer_dnn_lk_implbmm) {
            // implicit batched-matmul kernels for large-kernel depthwise conv
#if CUDA_VERSION >= 10020
            if (sm_algo_pack.f16_implicit_bmm[0].is_available_attribute(
                        args, positive_attr, negative_attr, workspace_limit_in_bytes))
                return &sm_algo_pack.f16_implicit_bmm[0];
#endif
            if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute(
                        args, positive_attr, negative_attr, workspace_limit_in_bytes))
                return &sm_algo_pack.f32_implicit_bmm[0];
        } else if (
                prefer_direct_lk &&
                sm_algo_pack.depthwise_large_filter.is_available_attribute(
                        args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
            return &sm_algo_pack.depthwise_large_filter;
        } else if (prefer_dnn_chanwise) {
            // in-house channel-wise kernels (fp and 8x8x32 variants)
            if (sm_algo_pack.chanwise.is_available_attribute(
                        args, positive_attr, negative_attr, workspace_limit_in_bytes))
                return &sm_algo_pack.chanwise;
            if (sm_algo_pack.chanwise8x8x32.is_available_attribute(
                        args, positive_attr, negative_attr, workspace_limit_in_bytes))
                return &sm_algo_pack.chanwise8x8x32;
        } else {
            // otherwise let cuDNN handle the chanwise case, using the
            // re-deduced dst layout for the pure-conv part
            conv_args.dst_layout = &dst_layout;
            if (is_cudnn_supported(conv_args)) {
                if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) {
                    return algo;
                }
            }
        }
    }
    //! Prefer CUDNN CONVBIAS (fused conv + bias + activation).
    bool cudnn_conv_bias_act_supported = false;
    for (auto&& algo : sm_algo_pack.cudnn_conv_bias_activations) {
        if (algo.is_available_attribute(
                    args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
            cudnn_conv_bias_act_supported = true;
            break;
        }
    }
    if (cudnn_conv_bias_act_supported) {
        if (auto algo = get_cudnn_algo(cudnn_conv_bias_act_from_enum_wrapper))
            return algo;
    }
    // fall back to plain cuDNN conv: point conv_args at the re-deduced
    // dst layout before probing
    conv_args.dst_layout = &dst_layout;
    if (is_cudnn_supported(conv_args)) {
        if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper))
            return algo;
    }
    if (auto algo = get_1x1_algo(args)) {
        return algo;
    }
    if (args.filter_meta.group > 1 &&
        sm_algo_pack.group.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        return &sm_algo_pack.group;
    }
    if (sm_algo_pack.fallback_nchw_qs8.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        return &sm_algo_pack.fallback_nchw_qs8;
    }
    if (sm_algo_pack.int1_simple.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        return &sm_algo_pack.int1_simple;
    }
    // last resort: pick from the generic algo lists (bfloat16 inputs have
    // their own dedicated list); the helper raises on total failure
    if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
        return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
                sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
                "cuda convbias fwd", positive_attr, negative_attr);
    } else {
        return megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
                sm_algo_pack.bfloat16_algos, args, workspace_limit_in_bytes,
                "cuda convbias fwd", positive_attr, negative_attr);
    }
}
  217. const char* ConvBiasForwardImpl::get_algorithm_set_name() const {
  218. return "CONV_BIAS_CUDA";
  219. }
  220. size_t ConvBiasForwardImpl::get_workspace_in_bytes(
  221. const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
  222. const TensorLayout& z, const TensorLayout& dst,
  223. const PreprocessedFilter* preprocessed_filter) {
  224. TensorLayoutArray layouts{src, filter, bias, z, dst};
  225. AlgorithmCache::Key key{this->handle(), this->get_opr_type(),
  226. layouts.data(), layouts.size(),
  227. &this->param(), sizeof(this->param())};
  228. auto rst = AlgorithmCache::instance().get(key);
  229. if (rst.policy.algo.valid()) {
  230. return rst.workspace;
  231. }
  232. AlgoBase::SizeArgs args{this, src, filter, bias, z, dst, preprocessed_filter};
  233. return get_algorithm(this, src, filter, bias, z, dst)->get_workspace_in_bytes(args);
  234. };
  235. size_t ConvBiasForwardImpl::get_preprocess_workspace_in_bytes(
  236. const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
  237. const TensorLayout& z, const TensorLayout& dst) {
  238. AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
  239. return get_algorithm(this, src, filter, bias, z, dst)
  240. ->get_preprocess_workspace_in_bytes(args);
  241. }
  242. SmallVector<TensorLayout> ConvBiasForwardImpl::deduce_preprocessed_filter_layout(
  243. const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias,
  244. const TensorLayout& z, const TensorLayout& dst) {
  245. AlgoBase::SizeArgs args{this, src, filter, bias, z, dst};
  246. return get_algorithm(this, src, filter, bias, z, dst)
  247. ->deduce_preprocessed_filter_layout(args);
  248. }
  249. void ConvBiasForwardImpl::exec_preprocess(
  250. const TensorLayout& src_layout, _megdnn_tensor_in filter,
  251. _megdnn_tensor_in bias, const TensorLayout& z_layout,
  252. const TensorLayout& dst_layout, PreprocessedFilter* preprocessed_filter,
  253. _megdnn_workspace workspace) {
  254. TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}, z{nullptr, z_layout};
  255. AlgoBase::ExecArgs args(
  256. this, src, filter, bias, z, dst, workspace, preprocessed_filter);
  257. auto algo = get_algorithm(
  258. this, src.layout, filter.layout, bias.layout, z.layout, dst.layout);
  259. return algo->exec_preprocess(args);
  260. }
  261. } // namespace cuda
  262. } // namespace megdnn
  263. // vim: syntax=cpp.doxygen