You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

profiler.cpp 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293
  1. /**
  2. * \file src/opr/impl/search_policy/profile.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #include "megbrain/opr/search_policy/profiler.h"
  13. #include "../internal/invoke.h"
  14. #include "../internal/megdnn_opr_wrapper.inl"
  15. #if MGB_ROCM
  16. #include "hcc_detail/hcc_defs_prologue.h"
  17. #include "megcore_rocm.h"
  18. #endif
  19. //! TODO: some megdnn oprs have to be known here when midout.h is produced;
  20. //! fix this if there is a more graceful way.
  21. #include "megdnn/oprs.h"
  22. #include "midout.h"
  23. MIDOUT_DECL(megbrain_opr_profile)
  24. #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) {
  25. #define MIDOUT_E \
  26. } \
  27. MIDOUT_END();
  28. namespace mgb {
  29. namespace opr {
  30. #define APPLY(statement, ...) \
  31. mgb::apply([&](const auto&... args) { return statement; }, \
  32. std::tuple_cat(__VA_ARGS__))
//! Per-operator profiling timeout in seconds; initialized exactly once at
//! static-initialization time via init_timeout_setting() (which also
//! registers the profiling entry points with TimedFuncInvoker).
template <typename Opr>
const double TimedProfiler<Opr>::timeout_setting =
        TimedProfiler<Opr>::init_timeout_setting();
  36. template <typename Opr>
  37. double TimedProfiler<Opr>::init_timeout_setting() {
  38. #if MGB_ENABLE_FASTRUN
  39. sys::TimedFuncInvoker::ins().register_func(
  40. AlgoChooserFuncId<Opr>::ID, &TimedProfiler<Opr>::prof_impl,
  41. &TimedProfiler<Opr>::prof_init_device);
  42. auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT");
  43. if (to_set)
  44. return std::stod(to_set);
  45. #endif
  46. return 0;
  47. }
  48. #define APPLY(statement, ...) \
  49. mgb::apply([&](const auto&... args) { return statement; }, \
  50. std::tuple_cat(__VA_ARGS__))
/*!
 * \brief run one timed profiling pass of a single algorithm
 *
 * Deserializes the profiling request from \p raw_param, builds the megdnn
 * operator on the requested comp node, selects the algorithm by name,
 * allocates inputs/outputs/workspace (and preprocessed filters when the
 * operator supports weight preprocessing), executes once between two timing
 * events and returns the elapsed time.
 *
 * This is invoked through TimedFuncInvoker (registered in
 * init_timeout_setting), presumably in a child process so that a hang or
 * crash of the kernel does not take down the caller — TODO confirm against
 * TimedFuncInvoker's implementation.
 *
 * \param raw_param POD-serialized Param (comp node, shapes, dtypes,
 *        algorithm name, workspace size, ...)
 * \return POD-serialized Result holding the measured kernel time
 */
template <typename Opr>
typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
        const TParam& raw_param) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl")))
#if MGB_ROCM
    // prof_init_device must have enabled MIOpen algo search before this runs
    bool miopen_algo_search_enabled;
    megcore::getMIOpenAlgoSearchStatus(&miopen_algo_search_enabled);
    mgb_assert(miopen_algo_search_enabled, "MIOpen algo search not enabled");
#endif
    auto&& param = raw_param.as_single_pod<Param>();
    CompNode cn = CompNode::load(param.comp_node_loc, param.comp_node_loc);
    auto megdnn_opr = intl::create_megdnn_opr<Opr>(cn);
    std::array<TensorLayout, arity> layouts;
    // Rebuild a DType from its serialized enum; quantized dtypes need dummy
    // scale/zero-point parameters since only the enum was serialized.
    auto from_enum = [&](DTypeEnum enumv) -> DType {
        switch (enumv) {
#define cb(_dt)                  \
    case DTypeTrait<_dt>::enumv: \
        return _dt(1.0f, static_cast<uint8_t>(0))
            cb(dtype::Quantized8Asymm);
#undef cb
#define cb(_dt)                  \
    case DTypeTrait<_dt>::enumv: \
        return _dt(1.0f)
            cb(dtype::QuantizedS8);
            cb(dtype::QuantizedS16);
            cb(dtype::QuantizedS32);
            default:
                return DType::from_enum(enumv);
#undef cb
        }
    };
    for (int i = 0; i < arity; ++i) {
        layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])};
    }
    megdnn_opr->param() = param.opr_param;
    {
        // locate the requested algorithm by name and pin the execution
        // policy to it, so exactly this algorithm gets timed
        typename Opr::AlgorithmInfo algo;
        for (auto i :
             APPLY(megdnn_opr->get_all_algorithms_info(args...), layouts)) {
            if (!strcmp(i.name.c_str(), param.algo_name)) {
                algo = i;
                break;
            }
        }
        mgb_assert(algo.valid(), "algorithm %s not found", param.algo_name);
        megdnn_opr->execution_policy() = {algo};
    }
    // Allocate preprocessed weight buffers.
    TensorLayoutArray preprocessed_layout;
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        if (param.allow_weight_preprocess) {
            preprocessed_layout = APPLY(
                    _(megdnn_opr)->deduce_preprocessed_filter_layout(args...),
                    layouts);
        }
    });
    {
        // first allocate a whole chunk to avoid memory fragmentation (here we
        // rely on memory allocator to reuse memory)
        auto align = cn.get_mem_addr_alignment();
        size_t tot_size = align;
        for (int i = 0; i < arity; ++i) {
            tot_size += layouts[i].span().high_byte + align;
        }
        for (const auto& layout : preprocessed_layout) {
            tot_size += layout.span().high_byte + align;
        }
        tot_size += param.workspace;
        DeviceTensorStorage storage{cn};
        storage.ensure_size(tot_size);
        // storage is intentionally dropped here; the following per-tensor
        // allocations presumably reuse this freed chunk — TODO confirm
        // against the comp-node allocator's reuse policy
    }
    // allocate input and output memory
    std::array<DeviceTensorND, arity_in> inp_val;
    std::array<DeviceTensorND, arity_out> out_val;
    DeviceTensorND workspace;
    for (int i = 0; i < arity_in; ++i) {
        inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]);
    }
    for (int i = 0; i < arity_out; ++i) {
        out_val[i]
                .comp_node(cn)
                .dtype(layouts[arity_in + i].dtype)
                .resize(layouts[arity_in + i]);
    }
    megdnn::Workspace mdn_workspace;
    // allocate workspace
    if (param.workspace) {
        workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace});
        mdn_workspace.size = param.workspace;
        mdn_workspace.raw_ptr = workspace.raw_ptr();
    }
    // allocate storage for preprocessed filter
    SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size());
    for (size_t i = 0; i < preprocessed_layout.size(); i++) {
        flt_val[i] = {cn, preprocessed_layout[i], preprocessed_layout[i].dtype,
                      preprocessed_layout[i].format};
    }
    // deterministic input contents so timing is reproducible
    for (int i = 0; i < arity_in; ++i) {
        fill_zero_dev_tensor(inp_val[i]);
    }
    // Run the (untimed) weight-preprocess pass, if supported and requested;
    // conv-bias passes one more input layout than plain convolution.
    PreprocessFilter<Opr> prep_flt;
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        if (!preprocessed_layout.empty()) {
            auto&& pf = _(prep_flt);
            pf.algorithm_id = nullptr;
            pf.tensors.resize(flt_val.size());
            for (size_t i = 0; i < flt_val.size(); i++) {
                pf.tensors[i] = flt_val[i].as_megdnn();
            }
            if_constexpr<opr_contain_bias<Opr>()>(
                    //! convbias
                    [&](auto __) {
                        APPLY(__(megdnn_opr)
                                      ->exec_preprocess(args..., &pf,
                                                        mdn_workspace),
                              std::forward_as_tuple(layouts[0],
                                                    inp_val[1].as_megdnn(),
                                                    inp_val[2].as_megdnn()),
                              array_skip<arity_in - 1>(layouts));
                    },
                    //! Convolution
                    [&](auto __) {
                        APPLY(__(megdnn_opr)
                                      ->exec_preprocess(args..., &pf,
                                                        mdn_workspace),
                              std::forward_as_tuple(layouts[0],
                                                    inp_val[1].as_megdnn()),
                              array_skip<2>(layouts));
                    });
        }
    });
    // timed section: bracket the exec with two device timing events
    RealTimer timer;
    auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER),
         ev_end = cn.create_event(CompNode::Event::NEED_TIMER);
    ev_start->record();
    if_constexpr<opr_supports_preprocess<Opr>()>(
            [&](auto _) {
                auto&& opr = _(megdnn_opr);
                PreprocessFilter<Opr>* pf =
                        preprocessed_layout.empty() ? nullptr : &prep_flt;
                APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace),
                      inp_val, out_val);
            },
            /* else */
            [&](auto _) {
                APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace),
                      inp_val, out_val);
            });
    ev_end->record();
    // Poll (sleeping 1ms per iteration) until the kernel finishes; warn
    // periodically so a long-running algorithm is visible to the user.
    double next_report_time = 0.5;
    while (!ev_end->finished()) {
        if (timer.get_secs() >= next_report_time) {
            mgb_log_warn(
                    "profiling conv algo %s already took %.3f/%.3f secs"
                    " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
                    param.algo_name, timer.get_secs(), param.actual_timeout);
            next_report_time = timer.get_secs() + 1;
        }
        using namespace std::literals;
        std::this_thread::sleep_for(1000us);
    }
    // release all free blocks owned by child process,
    // in order to avoid main process running out of memory
    cn.try_coalesce_all_free_memory();
    mgb_assert(ev_start->finished());
    return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)});
    MIDOUT_E
};
  219. template <typename Opr>
  220. Maybe<typename TimedProfiler<Opr>::Result> TimedProfiler<Opr>::profile(
  221. const Param& param, double& timeout) {
  222. mgb_assert(timeout >= 0);
  223. if (!timeout) {
  224. timeout = timeout_setting;
  225. } else if (timeout_setting) {
  226. timeout = std::min(timeout, timeout_setting);
  227. }
  228. param.actual_timeout =
  229. timeout ? timeout : std::numeric_limits<double>::infinity();
  230. auto res = sys::TimedFuncInvoker::ins().invoke(
  231. AlgoChooserFuncId<Opr>::ID,
  232. TParam::from_pod(const_cast<Param&>(param)), timeout);
  233. if (res.valid())
  234. return res.val().template as_single_pod<Result>();
  235. return None;
  236. }
  237. template <typename Opr>
  238. void TimedProfiler<Opr>::prof_init_device(const TParam& raw_param) {
  239. MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device")))
  240. #if MGB_ROCM
  241. megcore::enableMIOpenAlgoSearch(true);
  242. #endif
  243. auto&& param = raw_param.as_single_pod<Param>();
  244. CompNode cn = CompNode::load(param.comp_node_loc, param.comp_node_loc);
  245. // wait for cuda init, so its time does not get accounted in timeout
  246. cn.sync();
  247. MIDOUT_E
  248. }
// Explicitly instantiate every TimedProfiler member for each megdnn operator
// that participates in fastrun (the operator list is supplied by
// MGB_FOREACH_FASTRUN_OPR), so the definitions in this .cpp are emitted for
// all supported operator types.
#define INST(Opr)                                                             \
    template const double TimedProfiler<megdnn::Opr>::timeout_setting;        \
    template double TimedProfiler<megdnn::Opr>::init_timeout_setting();       \
    template typename TimedProfiler<megdnn::Opr>::TResult                     \
    TimedProfiler<megdnn::Opr>::prof_impl(const TParam& raw_param);           \
    template Maybe<typename TimedProfiler<megdnn::Opr>::Result>               \
    TimedProfiler<megdnn::Opr>::profile(const Param& param, double& timeout); \
    template void TimedProfiler<megdnn::Opr>::prof_init_device(               \
            const TParam& raw_param);
MGB_FOREACH_FASTRUN_OPR(INST)
#undef INST
  260. } // namespace opr
  261. } // namespace mgb
  262. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台