/**
 * \file dnn/src/rocm/convolution/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "hcc_detail/hcc_defs_prologue.h"

#include "./backward_data/algo.h"
#include "./backward_filter/algo.h"
#include "./forward/algo.h"
#include "./opr_impl.h"

#include "src/common/algo_chooser.h"
#include "src/rocm/utils.h"

using namespace megdnn;
using namespace rocm;

#define TO_STRING2(v) #v
#define TO_STRING(v) TO_STRING2(v)
#define MIOPEN_VERSION_STR          \
    TO_STRING(MIOPEN_VERSION_MAJOR) \
    "." TO_STRING(MIOPEN_VERSION_MINOR) "." TO_STRING(MIOPEN_VERSION_PATCH)
/* ============== ConvolutionForwardImpl ============== */

ConvolutionForwardImpl::Algorithm*
ConvolutionForwardImpl::get_algorithm_heuristic(const TensorLayout& src,
                                                const TensorLayout& filter,
                                                const TensorLayout& dst,
                                                size_t workspace_limit_in_bytes,
                                                bool reproducible) {
    auto fm = check_layout_fwd(src, filter, dst);
    return get_algorithm_heuristic(src, fm, dst, workspace_limit_in_bytes,
                                   reproducible);
}

ConvolutionForwardImpl::Algorithm*
ConvolutionForwardImpl::get_algorithm_heuristic(
        const TensorLayout& src, const CanonizedFilterMeta& filter,
        const TensorLayout& dst, size_t workspace_limit_in_bytes,
        bool reproducible) {
    AlgoBase::SizeArgs args(this, src, filter, dst);
    //! MIOpen auto-tuning needs to run with actual tensors, so we cannot get
    //! the best algorithm here.
    if (is_miopen_supported(args)) {
        auto algo = megdnn::get_reproducible_algo<ConvolutionForwardImpl>(
                sm_algo_pack.miopen_algos[0], reproducible);
        if (algo)
            return algo;
    }
    if (args.filter_meta.group > 1) {
        if (sm_algo_pack.chanwise.is_available_reproducible(
                    args, reproducible, workspace_limit_in_bytes)) {
            return &sm_algo_pack.chanwise;
        }
    }
    auto prefer_1x1 = [&args, reproducible, workspace_limit_in_bytes]() {
        const size_t MAX_BATCH_SIZE_FOR_1x1_MAT_ALGO = 4;
        size_t batch_size = args.src_layout->shape[0];
        if (batch_size > MAX_BATCH_SIZE_FOR_1x1_MAT_ALGO) {
            return false;
        }
        return sm_algo_pack.a1x1.is_available_reproducible(
                args, reproducible, workspace_limit_in_bytes);
    };
    if (prefer_1x1()) {
        return &sm_algo_pack.a1x1;
    }
    auto prefer_1x1_large_batch = [&args, reproducible,
                                   workspace_limit_in_bytes]() {
        const size_t MIN_BATCH_SIZE_FOR_1x1_LARGE_BATCH_ALGO = 32;
        size_t batch_size = args.src_layout->shape[0];
        if (batch_size < MIN_BATCH_SIZE_FOR_1x1_LARGE_BATCH_ALGO) {
            return false;
        }
        return sm_algo_pack.batched_matrix_mul.is_available_reproducible(
                args, reproducible, workspace_limit_in_bytes);
    };
    if (prefer_1x1_large_batch()) {
        return &sm_algo_pack.batched_matrix_mul;
    }
    if (reproducible) {
        return megdnn::get_reproducible_algo<ConvolutionForwardImpl>(
                sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
                "rocm conv fwd");
    } else {
        return megdnn::get_usable_algo<ConvolutionForwardImpl>(
                sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
                "rocm conv fwd");
    }
}

std::vector<ConvolutionForwardImpl::Algorithm*>
ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src,
                                           const TensorLayout& filter,
                                           const TensorLayout& dst) {
    return megdnn::get_all_algorithms<ConvolutionForwardImpl>(
            {this, src, filter, dst});
}

size_t ConvolutionForwardImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& dst, const PreprocessedFilter*) {
    AlgoBase::SizeArgs args(this, src, filter, dst);
    return get_algorithm(this, src, args.filter_meta, dst)
            ->get_workspace_in_bytes(args);
}

void ConvolutionForwardImpl::exec(_megdnn_tensor_in src,
                                  _megdnn_tensor_in filter,
                                  _megdnn_tensor_out dst,
                                  const PreprocessedFilter*,
                                  _megdnn_workspace workspace) {
    AlgoBase::ExecArgs args(this, src, filter, dst, workspace);
    auto algo = get_algorithm(this, src.layout, args.filter_meta, dst.layout);
    algo->check_workspace(args, workspace).exec(args);
}

const char* ConvolutionForwardImpl::get_algorithm_set_name() const {
    return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR;
}
/* ============== ConvolutionBackwardDataImpl ============== */

void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter,
                                       _megdnn_tensor_in diff,
                                       _megdnn_tensor_out grad,
                                       _megdnn_workspace workspace) {
    AlgoBase::ExecArgs args(this, filter, diff, grad, workspace);
    auto algo = get_algorithm(this, args.filter_meta, diff.layout, grad.layout);
    algo->check_workspace(args, workspace).exec(args);
}

std::vector<ConvolutionBackwardDataImpl::Algorithm*>
ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout& filter,
                                                const TensorLayout& diff,
                                                const TensorLayout& grad) {
    return megdnn::get_all_algorithms<ConvolutionBackwardDataImpl>(
            {this, filter, diff, grad});
}

ConvolutionBackwardDataImpl::Algorithm*
ConvolutionBackwardDataImpl::get_algorithm_heuristic(
        const TensorLayout& filter, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        bool reproducible) {
    auto fm = check_layout_fwd(grad, filter, diff);
    return get_algorithm_heuristic(fm, diff, grad, workspace_limit_in_bytes,
                                   reproducible);
}

ConvolutionBackwardDataImpl::Algorithm*
ConvolutionBackwardDataImpl::get_algorithm_heuristic(
        const CanonizedFilterMeta& filter, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        bool reproducible) {
    AlgoBase::SizeArgs args(this, filter, diff, grad);
    if (is_miopen_supported(args.as_fwd_args())) {
        auto algo = megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>(
                sm_algo_pack.miopen_algos[0], reproducible);
        if (algo)
            return algo;
    }
    if (args.filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_reproducible(
                args, reproducible, workspace_limit_in_bytes)) {
        return &sm_algo_pack.chanwise;
    }
    if (reproducible) {
        return megdnn::get_reproducible_algo<ConvolutionBackwardDataImpl>(
                sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
                "rocm conv bwd_data");
    } else {
        return megdnn::get_usable_algo<ConvolutionBackwardDataImpl>(
                sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
                "rocm conv bwd_data");
    }
}

size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(
        const TensorLayout& filter, const TensorLayout& diff,
        const TensorLayout& grad) {
    AlgoBase::SizeArgs args(this, filter, diff, grad);
    return get_algorithm(this, args.filter_meta, diff, grad)
            ->get_workspace_in_bytes(args);
}

const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const {
    return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR;
}
/* ============== ConvolutionBackwardFilterImpl ============== */

void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src,
                                         _megdnn_tensor_in diff,
                                         _megdnn_tensor_out grad,
                                         _megdnn_workspace workspace) {
    AlgoBase::ExecArgs args(this, src, diff, grad, workspace);
    auto algo =
            get_algorithm(this, src.layout, diff.layout, args.grad_filter_meta);
    algo->check_workspace(args, workspace).exec(args);
}

std::vector<ConvolutionBackwardFilterImpl::Algorithm*>
ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout& src,
                                                  const TensorLayout& diff,
                                                  const TensorLayout& grad) {
    return megdnn::get_all_algorithms<ConvolutionBackwardFilterImpl>(
            {this, src, diff, grad});
}

ConvolutionBackwardFilterImpl::Algorithm*
ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        bool reproducible) {
    auto fm = check_layout_fwd(src, grad, diff);
    return get_algorithm_heuristic(src, diff, fm, workspace_limit_in_bytes,
                                   reproducible);
}

ConvolutionBackwardFilterImpl::Algorithm*
ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& diff,
        const CanonizedFilterMeta& grad, size_t workspace_limit_in_bytes,
        bool reproducible) {
    AlgoBase::SizeArgs args(this, src, diff, grad);
    if (is_miopen_supported(args.as_fwd_args())) {
        auto algo =
                megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>(
                        sm_algo_pack.miopen_algos[0], reproducible);
        if (algo)
            return algo;
    }
    if (args.grad_filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_reproducible(
                args, reproducible, workspace_limit_in_bytes)) {
        // prefer special chanwise impl
        return &sm_algo_pack.chanwise;
    }
    if (reproducible) {
        return megdnn::get_reproducible_algo<ConvolutionBackwardFilterImpl>(
                sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
                "rocm conv bwd_filter");
    } else {
        return megdnn::get_usable_algo<ConvolutionBackwardFilterImpl>(
                sm_algo_pack.non_miopen_algos, args, workspace_limit_in_bytes,
                "rocm conv bwd_filter");
    }
}

size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& diff,
        const TensorLayout& grad) {
    AlgoBase::SizeArgs args(this, src, diff, grad);
    return get_algorithm(this, src, diff, args.grad_filter_meta)
            ->get_workspace_in_bytes(args);
}

const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const {
    return "ROCMCONV0+MIOPEN" MIOPEN_VERSION_STR;
}

// vim: syntax=cpp.doxygen
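
For reference, the MIOPEN_VERSION_STR macro used for the algorithm-set name above relies on the standard two-level preprocessor stringification idiom: TO_STRING2 stringizes its argument literally, while TO_STRING lets the preprocessor expand macros such as MIOPEN_VERSION_MAJOR first. A minimal standalone sketch (the version numbers below are made-up placeholders, not the real MIOpen values):

#include <cstdio>

#define TO_STRING2(v) #v
#define TO_STRING(v) TO_STRING2(v)

// Hypothetical stand-ins for the MIOpen version macros.
#define MIOPEN_VERSION_MAJOR 2
#define MIOPEN_VERSION_MINOR 11
#define MIOPEN_VERSION_PATCH 0

#define MIOPEN_VERSION_STR          \
    TO_STRING(MIOPEN_VERSION_MAJOR) \
    "." TO_STRING(MIOPEN_VERSION_MINOR) "." TO_STRING(MIOPEN_VERSION_PATCH)

int main() {
    // Prints "2.11.0". With single-level stringification (applying TO_STRING2
    // directly) it would print the macro names instead of their values.
    std::printf("%s\n", MIOPEN_VERSION_STR);
    return 0;
}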

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU and GPU build. If you want to run GPU programs, make sure the machine has GPU hardware and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
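
As a quick way to confirm that a GPU and its driver are actually visible on the machine, one can query the CUDA runtime directly. This is a generic sketch, not part of MegEngine, and it assumes the CUDA toolkit headers and runtime library are installed:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        // Typically indicates a missing or mismatched driver.
        std::printf("CUDA runtime/driver problem: %s\n", cudaGetErrorString(err));
        return 1;
    }
    if (count == 0) {
        std::printf("no CUDA-capable GPU found\n");
        return 1;
    }
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        std::printf("device %d: %s\n", i, prop.name);
    }
    return 0;
}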