
opr_proxy.h

/**
 * \file dnn/test/common/opr_proxy.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once

#include "test/common/deduce_layout_proxy.h"
#include "test/common/exec_proxy.h"
#include "test/common/inspect_type.h"
#include "test/common/opr_algo_proxy.h"
#include "test/common/opr_trait.h"
#include "test/common/timer.h"
#include "test/common/workspace_wrapper.h"

#include <algorithm>
#include <cstdio>
#include <limits>
#include <memory>
namespace megdnn {
namespace test {
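
// The default proxy just stitches together layout deduction and execution,
// as selected by the operator's compile-time traits.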
template <typename Opr, size_t arity = OprTrait<Opr>::arity,
          bool has_workspace = OprTrait<Opr>::has_workspace,
          bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout>
struct OprProxyDefaultImpl
        : public DeduceLayoutProxy<Opr, arity, can_deduce_layout>,
          public ExecProxy<Opr, arity, has_workspace> {};

template <typename Opr>
struct OprProxy : public OprProxyDefaultImpl<Opr> {};

template <typename Opr>
struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {};

template <typename Opr>
struct OprProxyVectorToSingle {};
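
// Specializations for oprs whose inputs are packed into a single tensor
// array: by convention the last element is the output (Elemwise, Concat),
// and for Split the first element is the input.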
template <>
struct OprProxy<ElemwiseForward> {
    static void deduce_layout(ElemwiseForward* opr,
                              TensorLayoutArray& layouts) {
        megdnn_assert(layouts.size() >= 2);
        auto inp = layouts;
        inp.pop_back();
        opr->deduce_layout(inp, layouts.back());
    }

    static void exec(ElemwiseForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() >= 2);
        auto inp = tensors;
        inp.pop_back();
        opr->exec(inp, tensors.back());
    }
};

template <>
struct OprProxy<ElemwiseMultiType> {
    static void deduce_layout(ElemwiseMultiType* opr,
                              TensorLayoutArray& layouts) {
        megdnn_assert(layouts.size() >= 2);
        auto inp = layouts;
        inp.pop_back();
        opr->deduce_layout(inp, layouts.back());
    }

    static void exec(ElemwiseMultiType* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() >= 2);
        auto inp = tensors;
        inp.pop_back();
        opr->exec(inp, tensors.back());
    }
};

template <>
struct OprProxy<ConcatForward> {
    static void deduce_layout(ConcatForward* opr, TensorLayoutArray& layouts) {
        megdnn_assert(layouts.size() >= 2);
        auto inp = layouts;
        inp.pop_back();
        opr->deduce_layout(inp, layouts.back());
    }

    static void exec(ConcatForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() >= 2);
        auto inp = tensors;
        inp.pop_back();

        TensorLayoutArray layouts(tensors.size());
        std::transform(tensors.begin(), tensors.end(), layouts.begin(),
                       [](const TensorND& tensor) { return tensor.layout; });
        auto inp_layouts = layouts;
        inp_layouts.pop_back();

        WorkspaceWrapper W(opr->handle(),
                           opr->get_workspace_in_bytes(inp_layouts,
                                                       layouts.back()));

        auto inp_tensors = tensors;
        inp_tensors.pop_back();
        opr->exec(inp_tensors, tensors.back(), W.workspace());
    }
};

template <>
struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> {
    static void exec(SplitForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() >= 2);
        auto out = tensors;
        out.erase(out.begin());

        TensorLayoutArray layouts(tensors.size());
        std::transform(tensors.begin(), tensors.end(), layouts.begin(),
                       [](const TensorND& tensor) { return tensor.layout; });
        auto out_layouts = layouts;
        out_layouts.erase(out_layouts.begin());

        WorkspaceWrapper W(
                opr->handle(),
                opr->get_workspace_in_bytes(layouts.front(), out_layouts));

        auto out_tensors = tensors;
        out_tensors.erase(out_tensors.begin());
        opr->exec(tensors.front(), out_tensors, W.workspace());
    }
};

//! OprProxy impl for ternary oprs with profiling support
template <class Opr, int arity>
struct OprProxyProfilingBase
        : public DeduceLayoutProxy<Opr, arity,
                                   OprTrait<Opr>::can_deduce_layout> {
    size_t warmup_times = 10, exec_times = 100;

    //! whether to enable profiling
    bool m_profiling;
    WorkspaceWrapper W;

    //! target algo setup by profiler; it can also be directly specified by the
    //! caller
    typename Opr::AlgorithmInfo target_algo_info;

    OprProxyProfilingBase(bool profile = false) { m_profiling = profile; }

    //! used for alloc tensor for weight preprocess
    static std::shared_ptr<TensorNDArray> alloc_tensors(
            Handle* handle, const TensorLayoutArray& layouts) {
        auto deleter = [handle](TensorNDArray* ptr) {
            for (auto&& i : *ptr) {
                auto pdata = static_cast<dt_byte*>(i.raw_ptr) +
                             i.layout.span().low_byte;
                megdnn_free(handle, pdata);
            }
            delete ptr;
        };
        std::shared_ptr<TensorNDArray> ret{new TensorNDArray, deleter};
        for (size_t i = 0; i < layouts.size(); ++i) {
            auto span = layouts[i].span();
            ret->emplace_back(static_cast<dt_byte*>(megdnn_malloc(
                                      handle, span.dist_byte())) -
                                      span.low_byte,
                              layouts[i]);
        }
        return ret;
    }
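
    // Execute the opr on `tensors`; with profiling enabled and no algorithm
    // chosen yet, every available algorithm is benchmarked first and the
    // fastest one is cached in target_algo_info.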
    void exec(Opr* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == arity);
        if (!W.valid()) {
            W = WorkspaceWrapper(opr->handle(), 0);
        }
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
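        // Benchmark each candidate algorithm: warm up, time exec_times runs,
        // and keep the fastest one.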
        if (m_profiling && !target_algo_info.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo :
                 AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) {
                opr->execution_policy().algo = algo;
                auto workspace_size =
                        AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr,
                                                                      layouts);
                W.update(workspace_size);
                for (size_t times = 0; times < warmup_times; ++times)
                    AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < exec_times; ++times) {
                    AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo.name.c_str());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    target_algo_info = algo;
                }
            }
            opr->execution_policy().algo = target_algo_info;
            auto workspace_size =
                    AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
            W.update(workspace_size);
        }
        if (!target_algo_info.valid()) {
            auto workspace_size =
                    AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
            W.update(workspace_size);
        }
        AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
    }
};

#define DEF_PROF(c, arity)                                            \
    template <>                                                       \
    struct OprProxy<c> : public OprProxyProfilingBase<c, arity> {     \
        using OprProxyProfilingBase<c, arity>::OprProxyProfilingBase; \
    }
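
// `arity` counts all tensors an opr touches, inputs and outputs together
// (e.g. ConvolutionForward: src, filter, dst -> 3).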
DEF_PROF(ConvolutionForward, 3);
DEF_PROF(ConvolutionBackwardData, 3);
DEF_PROF(ConvolutionBackwardFilter, 3);
DEF_PROF(LocalShareForward, 3);
DEF_PROF(LocalShareBackwardData, 3);
DEF_PROF(LocalShareBackwardFilter, 3);

DEF_PROF(DeformableConvForward, 5);
DEF_PROF(DeformableConvBackwardFilter, 5);
DEF_PROF(BatchConvBiasForward, 5);
DEF_PROF(ConvBiasForward, 5);

DEF_PROF(DeformableConvBackwardData, 8);
#undef DEF_PROF
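
// Profiling proxy for oprs that support weight pre-processing: the filter is
// transformed once per algorithm via exec_preprocess() and the transformed
// copy is handed to every subsequent exec() call.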
template <class Opr, int arity>
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> {
    using Base = OprProxyProfilingBase<Opr, arity>;
    void exec(Opr* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == arity);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
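        // Each algorithm may require its own preprocessed filter layout, so
        // re-run weight preprocessing for every candidate before timing it.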
        if (Base::m_profiling && !Base::target_algo_info.desc.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo :
                 AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) {
                opr->execution_policy().algo = algo;
                auto preprocess_tensors =
                        weight_preprocess(opr, tensors, algo.desc);
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                typename Opr::PreprocessedFilter preprocessed_filter{
                        nullptr, *preprocess_tensors};
                auto workspace_size =
                        AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                                opr, layouts, &preprocessed_filter);
                Base::W.update(workspace_size);
                for (size_t times = 0; times < Base::warmup_times; ++times) {
                    AlgoProxy<Opr, arity>::exec(opr, tensors,
                                                &preprocessed_filter,
                                                Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    AlgoProxy<Opr, arity>::exec(opr, tensors,
                                                &preprocessed_filter,
                                                Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo.name.c_str());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_algo_info = algo;
                }
            }
            opr->execution_policy().algo = Base::target_algo_info;
            auto preprocess_tensors = weight_preprocess(
                    opr, tensors, Base::target_algo_info.desc);
            megcoreSynchronize(opr->handle()->megcore_computing_handle());
            typename Opr::PreprocessedFilter preprocessed_filter{
                    nullptr, *preprocess_tensors};
            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                    opr, layouts, &preprocessed_filter);
            Base::W.update(workspace_size);
        }
        auto preprocess_tensors =
                weight_preprocess(opr, tensors, Base::target_algo_info.desc);
        megcoreSynchronize(opr->handle()->megcore_computing_handle());
        typename Opr::PreprocessedFilter preprocessed_filter{
                nullptr, *preprocess_tensors};
        if (!Base::target_algo_info.valid()) {
            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                    opr, layouts, &preprocessed_filter);
            Base::W.update(workspace_size);
        }
        AlgoProxy<Opr, arity>::exec(opr, tensors, &preprocessed_filter,
                                    Base::W.workspace());
    }

    //! handle weight preprocess
    std::shared_ptr<TensorNDArray> weight_preprocess(
            Opr* opr, const TensorNDArray& tensors,
            const typename Opr::AlgorithmDesc&) {
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
        auto weight_preprocess_layouts =
                AlgoProxy<Opr, arity>::deduce_preprocessed_filter_layout(
                        opr, layouts);
        auto preprocessed_filter_tensors_ptr =
                Base::alloc_tensors(opr->handle(), weight_preprocess_layouts);
        typename Opr::PreprocessedFilter preprocessed_filter{
                nullptr, *preprocessed_filter_tensors_ptr};
        size_t preprocess_workspace_size =
                AlgoProxy<Opr, arity>::get_preprocess_workspace_in_bytes(
                        opr, layouts);
        WorkspaceWrapper preprocess_workspace(opr->handle(),
                                              preprocess_workspace_size);
        AlgoProxy<Opr, arity>::exec_preprocess(
                opr, tensors, layouts, &preprocessed_filter,
                preprocess_workspace.workspace());
        return preprocessed_filter_tensors_ptr;
    }
};

#define DEF_PROF(c, arity)                                        \
    template <>                                                   \
    struct OprWeightPreprocessProxy<c>                            \
            : public OprWeightPreprocessProxyImpl<c, arity> {     \
        using OprWeightPreprocessProxyImpl<                       \
                c, arity>::OprWeightPreprocessProxyImpl;          \
    }

DEF_PROF(ConvolutionForward, 3);
DEF_PROF(ConvBias, 5);
#undef DEF_PROF

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen
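
As a usage illustration (not part of the header): a minimal sketch of how a test might drive a profiling proxy. The function name `profile_conv_bias` is hypothetical; it assumes a valid megdnn Handle* and a TensorNDArray whose five entries (src, filter, bias, z, dst) are already allocated with consistent layouts.

#include "megdnn/oprs.h"
#include "test/common/opr_proxy.h"

// Hypothetical sketch: profile all ConvBiasForward algorithms, then run the
// fastest one.
void profile_conv_bias(megdnn::Handle* handle,
                       const megdnn::TensorNDArray& tensors) {
    using namespace megdnn;
    auto opr = handle->create_operator<ConvBiasForward>();
    // Constructing with `true` enables profiling: the first exec() call
    // benchmarks every algorithm, prints per-algorithm timings, and caches
    // the fastest in target_algo_info.
    test::OprProxy<ConvBiasForward> proxy{true};
    proxy.exec(opr.get(), tensors);
    // Subsequent calls reuse the cached algorithm without re-profiling.
    proxy.exec(opr.get(), tensors);
}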
