
matmul.cpp
/**
 * \file dnn/src/cuda/convolution/backward_data/matmul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
  12. #include "./algo.h"
  13. #include "src/cuda/convolution/helper.h"
  14. #include "src/cuda/convolution/im2col.cuh"
  15. #include "src/cuda/matrix_mul/opr_impl.h"
  16. #include "src/cuda/utils.h"
  17. using namespace megdnn;
  18. using namespace cuda;
  19. namespace {
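// Build the layouts and param of the MatrixMulForward sub-operator used by the
// matmul-based backward-data algorithm. The returned layouts {Al, Cl, Bl} are
// ordered as the sub-operator's (A, B, C) operands: with transposeA = true it
// computes the col matrix Bl = Al^T * Cl, i.e. col (IC*FH*FW, OH*OW*N) =
// filter^T (IC*FH*FW, OC) * diff (OC, OH*OW*N); col is scattered into grad by
// col2im in exec_internal().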
std::pair<TensorLayoutArray, MatrixMulForward::Param> sub_opr_config(
        const ConvolutionBackwardDataImpl::CanonizedFilterMeta& fm,
        const TensorLayout& filter_layout, const TensorLayout& diff_layout,
        const TensorLayout& grad_layout, const ConvolutionBackwardDataImpl* opr) {
    size_t N = grad_layout.shape[0], IC = fm.icpg, OC = fm.ocpg,
           OH = diff_layout.shape[2], OW = diff_layout.shape[3],
           FH = fm.spatial[0], FW = fm.spatial[1];

    megdnn_assert(filter_layout.dtype.enumv() == diff_layout.dtype.enumv());
    TensorLayout Al({OC, IC * FH * FW}, filter_layout.dtype),
            Bl({IC * FH * FW, OH * OW * N}, filter_layout.dtype),
            Cl({OC, OH * OW * N}, filter_layout.dtype);
    MatrixMulForward::Param param;
    if (opr->param().compute_mode == param::Convolution::ComputeMode::FLOAT32) {
        param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
    }
    param.transposeA = true;
    return {{Al, Cl, Bl}, param};
}
}  // namespace
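
// Report the MatrixMulForward sub-operator (its layouts plus serialized param)
// so that the algorithm search can also choose an execution policy for it.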
std::vector<Algorithm::SearchItem>
ConvolutionBackwardDataImpl::AlgoMatmul::get_subopr_list(
        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
    const ConvolutionBackwardDataImpl* conv_backward_data_opr =
            static_cast<const ConvolutionBackwardDataImpl*>(opr);
    CanonizedFilterMeta fm = conv_backward_data_opr->check_layout_fwd(
            layouts[2], layouts[0], layouts[1]);
    auto&& config = sub_opr_config(
            fm, layouts[0], layouts[1], layouts[2], conv_backward_data_opr);

    std::string param_str;
    Algorithm::serialize_write_pod(config.second, param_str);
    return {{Algorithm::OprType::MATRIX_MUL_FORWARD, param_str, config.first}};
}

bool ConvolutionBackwardDataImpl::AlgoMatmul::is_available(
        const SizeArgs& args) const {
    if (args.diff_layout->dtype == args.filter_layout->dtype &&
        args.diff_layout->dtype == dtype::BFloat16()) {
        return false;
    }
    auto&& fm = args.filter_meta;
    return args.filter_meta.format == Param::Format::NCHW &&
           args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
           fm.group == 1 && fm.spatial_ndim == 2;
}

size_t ConvolutionBackwardDataImpl::AlgoMatmul::get_workspace_in_bytes(
        const SizeArgs& args) const {
    auto matmul_opr = args.handle->create_operator<MatrixMulForward>();
    if (args.opr->execution_policy().algo.valid() &&
        !args.opr->execution_policy().sub_policy.empty()) {
        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
        matmul_opr->execution_policy() = args.opr->execution_policy().sub_policy[0];
    }
    auto&& config = sub_opr_config(
            args.filter_meta, *args.filter_layout, *args.diff_layout,
            *args.grad_layout, args.opr);
    matmul_opr->param() = config.second;

    auto&& sizes = matmul_get_workspace_bundle(args.as_fwd_args());
    sizes.push_back(matmul_opr->get_workspace_in_bytes(
            config.first[0], config.first[1], config.first[2]));
    return WorkspaceBundle(nullptr, sizes).total_size_in_bytes();
}

void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs& args) const {
#define cb(DType)                                        \
    if (args.diff_layout->dtype == DType()) {            \
        using ctype = typename DTypeTrait<DType>::ctype; \
        exec_internal<ctype>(args);                      \
        return;                                          \
    }
    MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
#undef cb

    megdnn_assert_internal(0);
}
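
// Run the matmul-based backward-data computation in three steps:
//   1. relayout diff so that it can be read as an (OC, OH*OW*N) matrix (diff_t),
//   2. GEMM: col = filter^T * diff_t, written into the col buffer,
//   3. col2im: scatter col back into grad.
// The workspace bundle holds {diff_t, col, [flipped filter,] matmul workspace}
// in that order; the flipped-filter slot only exists when fm.should_flip.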
template <typename T>
void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal(const ExecArgs& args) {
    auto&& fm = args.filter_meta;
    size_t N = args.grad_layout->shape[0], IC = fm.icpg,
           IH = args.grad_layout->shape[2], IW = args.grad_layout->shape[3],
           OC = fm.ocpg, OH = args.diff_layout->shape[2],
           OW = args.diff_layout->shape[3], FH = fm.spatial[0],
           FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1],
           SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0],
           DW = fm.dilation[1];
    auto stream = cuda_stream(args.handle);

    auto matmul_opr = args.handle->create_operator<MatrixMulForward>();
    if (args.opr->execution_policy().algo.valid()) {
        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
        matmul_opr->execution_policy() = args.opr->execution_policy().sub_policy[0];
    }
    auto&& config = sub_opr_config(
            args.filter_meta, *args.filter_layout, *args.diff_layout,
            *args.grad_layout, args.opr);
    matmul_opr->param() = config.second;

    auto&& sizes = matmul_get_workspace_bundle(args.as_fwd_args());
    sizes.push_back(matmul_opr->get_workspace_in_bytes(
            config.first[0], config.first[1], config.first[2]));
    auto wbundle = WorkspaceBundle(args.workspace.raw_ptr, sizes);

    T* diff_t = static_cast<T*>(wbundle.get(0));
    T* col = static_cast<T*>(wbundle.get(1));
    {
        // transpose diff
        TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait<T>::dtype()),
                tol(froml);
        froml.stride[0] = args.diff_layout->stride[0];
        tol.stride[0] = 1;
        tol.stride[1] = N;
        TensorND from(args.diff_tensor->ptr<T>(), froml), to(diff_t, tol);
        args.handle->relayout_opr()->exec(from, to);
    }
    {
        // take gemm grad
        TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()),
                Bl({IC * FH * FW, OH * OW * N}, typename DTypeTrait<T>::dtype()),
                Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype());
        TensorND A(args.filter_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl);
        if (fm.should_flip) {
            convolution::flip_filter(
                    args.as_fwd_args(), wbundle.get_workspace(2), A.raw_ptr);
            matmul_opr->exec(A, C, B, wbundle.get_workspace(3));
        } else {
            matmul_opr->exec(A, C, B, wbundle.get_workspace(2));
        }
    }
    {
        // col2im
        convolution::col2im<T>(
                col, args.grad_tensor->ptr<T>(), N, args.grad_layout->stride[0],
                IC, IH, IW, FH, FW, OH, OW, PH, PW, SH, SW, DH, DW, stream);
    }
}

// vim: syntax=cpp.doxygen

The MegEngine installation package bundles the CUDA environment needed to run code on GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine has a GPU device and the driver is properly installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit MegStudio.
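
As a quick sanity check that a GPU and its driver are actually visible before running GPU programs, a minimal standalone CUDA runtime probe (a sketch independent of MegEngine's own API; the file name check_gpu.cpp is hypothetical) could look like this:

    // check_gpu.cpp -- hypothetical standalone helper, not part of MegEngine;
    // build with: nvcc check_gpu.cpp -o check_gpu
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        int count = 0;
        cudaError_t err = cudaGetDeviceCount(&count);
        if (err != cudaSuccess || count == 0) {
            // no usable device: missing GPU, missing driver, or driver/runtime mismatch
            std::printf("no usable CUDA device: %s\n", cudaGetErrorString(err));
            return 1;
        }
        for (int i = 0; i < count; ++i) {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, i);
            std::printf("device %d: %s (compute capability %d.%d)\n", i, prop.name,
                        prop.major, prop.minor);
        }
        return 0;
    }

If this prints at least one device, the machine satisfies the hardware and driver requirements described above.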