You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

opr_impl.cpp 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. /**
  2. * \file dnn/src/naive/local_share/opr_impl.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "src/naive/local_share/opr_impl.h"
  12. #include "src/naive/convolution/helper.h"
  13. #include <cstring>
  14. #include "src/common/utils.h"
  15. #include "src/naive/handle.h"
  16. using namespace megdnn;
  17. using namespace naive;
  18. using namespace convolution;
  19. namespace {
  20. template <typename stype, typename ftype, typename dtype, typename comp_type,
  21. class Strategy>
  22. void naive_kern(_megdnn_tensor_in src, _megdnn_tensor_in filter,
  23. _megdnn_tensor_out dst, LocalShare::Param param) {
  24. size_t spatial_start, channel_pos, kern_spatial_start;
  25. spatial_start = 2;
  26. channel_pos = 1;
  27. kern_spatial_start = 3;
  28. size_t groups = 1;
  29. if (param.sparse == LocalShare::Param::Sparse::GROUP) {
  30. kern_spatial_start = 4;
  31. groups = filter.layout.shape[0];
  32. }
  33. auto N = src.layout.shape[0], IC = src.layout.shape[channel_pos],
  34. IH = src.layout.shape[spatial_start],
  35. IW = src.layout.shape[spatial_start + 1];
  36. auto FH = filter.layout.shape[kern_spatial_start],
  37. FW = filter.layout.shape[kern_spatial_start + 1];
  38. auto OC = dst.layout.shape[channel_pos],
  39. OH = dst.layout.shape[spatial_start],
  40. OW = dst.layout.shape[spatial_start + 1];
  41. size_t icpg = IC / groups, ocpg = OC / groups;
  42. size_t SGH = param.spatial_groups_h, SGW = param.spatial_groups_w;
  43. size_t GRP_OH = OH / SGH, GRP_OW = OW / SGW;
  44. size_t FS_G, FS_OC, FS_IC, FS_SPATIAL;
  45. // sgh, sgw, ic, fh, fw, oc
  46. FS_OC = 1;
  47. FS_SPATIAL = FS_OC * ocpg;
  48. FS_IC = FH * FW * FS_SPATIAL;
  49. FS_G = FS_IC * icpg * SGH * SGW;
  50. size_t PH = param.pad_h, PW = param.pad_w;
  51. size_t SH = param.stride_h, SW = param.stride_w;
  52. size_t dh = param.dilate_h, dw = param.dilate_w;
  53. megdnn_assert(param.dilate_h == 1 && param.dilate_w == 1);
  54. stype* __restrict sptr = src.compatible_ptr<stype>();
  55. ftype* __restrict fptr = filter.compatible_ptr<ftype>();
  56. dtype* __restrict dptr = dst.compatible_ptr<dtype>();
  57. int h_offset = -PH, w_offset = -PW;
  58. auto get_linear_addr = [](ptrdiff_t n, ptrdiff_t c, ptrdiff_t h,
  59. ptrdiff_t w,
  60. const TensorLayout& layout) -> ptrdiff_t {
  61. return n * layout.stride[0] + c * layout.stride[1] +
  62. h * layout.stride[2] + w * layout.stride[3];
  63. };
  64. auto get_filter_addr = [&](GroupCounter& gc_out, size_t ic, size_t ic0,
  65. size_t fh, size_t fw) {
  66. return gc_out.cur_grp * FS_G + gc_out.cur_off * FS_OC +
  67. (ic - ic0) * FS_IC + (fh * FW + fw) * FS_SPATIAL;
  68. };
  69. for (size_t n = 0; n < N; ++n) {
  70. GroupCounter gc_out{ocpg};
  71. for (size_t oc = 0; oc < OC; ++oc, gc_out.next()) {
  72. for (size_t oh = 0; oh < OH; ++oh) {
  73. for (size_t ow = 0; ow < OW; ++ow) {
  74. comp_type dval =
  75. dptr[get_linear_addr(n, oc, oh, ow, dst.layout)];
  76. Strategy::init_dval(dval);
  77. size_t grp_oh = oh / GRP_OH, grp_ow = ow / GRP_OW;
  78. ftype* fptr_cur = fptr + (grp_oh * SGW + grp_ow) * ocpg *
  79. icpg * FH * FW;
  80. for (size_t fh = 0; fh < FH; ++fh) {
  81. for (size_t fw = 0; fw < FW; ++fw) {
  82. uint32_t ih = SH * oh + fh * dh + h_offset,
  83. iw = SW * ow + fw * dw + w_offset;
  84. // here ih and iw are represented in unsigned int
  85. // they will become very large if underflow occurs
  86. if (ih < IH && iw < IW) {
  87. size_t ic0 = gc_out.cur_grp * icpg,
  88. ic1 = ic0 + icpg;
  89. for (size_t ic = ic0; ic < ic1; ++ic) {
  90. stype& sval = sptr[get_linear_addr(
  91. n, ic, ih, iw, src.layout)];
  92. ftype& fval = fptr_cur[get_filter_addr(
  93. gc_out, ic, ic0, fh, fw)];
  94. Strategy::on(sval, fval, dval,
  95. src.layout.dtype,
  96. filter.layout.dtype,
  97. dst.layout.dtype);
  98. }
  99. }
  100. }
  101. }
  102. Strategy::write(
  103. dval,
  104. dptr[get_linear_addr(n, oc, oh, ow, dst.layout)]);
  105. }
  106. }
  107. }
  108. }
  109. }
  110. } // namespace
  111. void LocalShareForwardImpl::exec(_megdnn_tensor_in src,
  112. _megdnn_tensor_in filter,
  113. _megdnn_tensor_out dst,
  114. _megdnn_workspace workspace) {
  115. check_exec(src.layout, filter.layout, dst.layout, workspace.size);
  116. MEGDNN_DISPATCH_CPU_KERN_OPR(
  117. (naive_kern<dt_float32, dt_float32, dt_float32, dt_float32,
  118. StrategyFwd>(src, filter, dst, param())););
  119. }
  120. void LocalShareBackwardDataImpl::exec(_megdnn_tensor_in filter,
  121. _megdnn_tensor_in diff,
  122. _megdnn_tensor_out grad,
  123. _megdnn_workspace workspace) {
  124. check_exec(filter.layout, diff.layout, grad.layout, workspace.size);
  125. MEGDNN_DISPATCH_CPU_KERN_OPR(
  126. (naive_kern<dt_float32, dt_float32, dt_float32, dt_float32,
  127. StrategyBwdData>(grad, filter, diff, param())););
  128. }
  129. void LocalShareBackwardFilterImpl::exec(_megdnn_tensor_in src,
  130. _megdnn_tensor_in diff,
  131. _megdnn_tensor_out grad,
  132. _megdnn_workspace workspace) {
  133. check_exec(src.layout, diff.layout, grad.layout, workspace.size);
  134. MEGDNN_DISPATCH_CPU_KERN_OPR(
  135. (naive_kern<dt_float32, dt_float32, dt_float32, dt_float32,
  136. StrategyBwdFlt>(src, grad, diff, param())););
  137. }
  138. std::vector<LocalShareForward::Algorithm*>
  139. LocalShareForwardImpl::get_all_algorithms(const TensorLayout&,
  140. const TensorLayout&,
  141. const TensorLayout&) {
  142. return {static_cast<HandleImpl*>(handle())->default_local_share_fwd_algo()};
  143. }
  144. LocalShareForward::Algorithm* LocalShareForwardImpl::get_algorithm_heuristic(
  145. const TensorLayout& /* src */, const TensorLayout& /* diff */,
  146. const TensorLayout& /* grad */, size_t /* workspace_limit_in_bytes */,
  147. const AlgoAttribute& positive_attr,
  148. const AlgoAttribute& negative_attr) {
  149. auto algo =
  150. static_cast<HandleImpl*>(handle())->default_local_share_fwd_algo();
  151. algo->check_attribute(positive_attr, negative_attr);
  152. return algo;
  153. }
  154. LocalShareForward::Algorithm*
  155. LocalShareForwardImpl::get_algorithm_from_desc(
  156. const AlgorithmDesc& desc) {
  157. Algorithm* ret =
  158. static_cast<HandleImpl*>(handle())->default_local_share_fwd_algo();
  159. megdnn_assert(desc == ret->info().desc);
  160. return ret;
  161. }
  162. std::vector<LocalShareBackwardData::Algorithm*>
  163. LocalShareBackwardDataImpl::get_all_algorithms(const TensorLayout&,
  164. const TensorLayout&,
  165. const TensorLayout&) {
  166. return {static_cast<HandleImpl*>(handle())
  167. ->default_local_share_bwd_data_algo()};
  168. }
  169. LocalShareBackwardData::Algorithm*
  170. LocalShareBackwardDataImpl::get_algorithm_heuristic(
  171. const TensorLayout& /* filter */, const TensorLayout& /* diff */,
  172. const TensorLayout& /* grad */, size_t /* workspace_limit_in_bytes */,
  173. const AlgoAttribute& positive_attr,
  174. const AlgoAttribute& negative_attr) {
  175. auto algo = static_cast<HandleImpl*>(handle())
  176. ->default_local_share_bwd_data_algo();
  177. algo->check_attribute(positive_attr, negative_attr);
  178. return algo;
  179. }
  180. LocalShareBackwardData::Algorithm*
  181. LocalShareBackwardDataImpl::get_algorithm_from_desc(
  182. const AlgorithmDesc& desc) {
  183. Algorithm* ret = static_cast<HandleImpl*>(handle())
  184. ->default_local_share_bwd_data_algo();
  185. megdnn_assert(desc == ret->info().desc);
  186. return ret;
  187. }
  188. std::vector<LocalShareBackwardFilter::Algorithm*>
  189. LocalShareBackwardFilterImpl::get_all_algorithms(const TensorLayout&,
  190. const TensorLayout&,
  191. const TensorLayout&) {
  192. return {static_cast<HandleImpl*>(handle())
  193. ->default_local_share_bwd_filter_algo()};
  194. }
  195. LocalShareBackwardFilter::Algorithm*
  196. LocalShareBackwardFilterImpl::get_algorithm_heuristic(
  197. const TensorLayout& /* src */, const TensorLayout& /* diff */,
  198. const TensorLayout& /* grad */, size_t /* workspace_limit_in_bytes */,
  199. const AlgoAttribute& positive_attr,
  200. const AlgoAttribute& negative_attr) {
  201. auto algo = static_cast<HandleImpl*>(handle())
  202. ->default_local_share_bwd_filter_algo();
  203. algo->check_attribute(positive_attr, negative_attr);
  204. return algo;
  205. }
  206. LocalShareBackwardFilter::Algorithm*
  207. LocalShareBackwardFilterImpl::get_algorithm_from_desc(
  208. const AlgorithmDesc& desc) {
  209. Algorithm* ret = static_cast<HandleImpl*>(handle())
  210. ->default_local_share_bwd_filter_algo();
  211. megdnn_assert(desc == ret->info().desc);
  212. return ret;
  213. }
  214. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台