
profiler.cpp 8.0 kB

/**
 * \file imperative/src/impl/profiler.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/imperative/profiler.h"

#include "./function_hook.h"
#include "megbrain/imperative/ops/opr_attr.h"
#include "megbrain/imperative/physical_tensor.h"
#include "megbrain/plugin/opr_footprint.h"

#include "./event_pool.h"
#include "./op_trait.h"

namespace mgb {
namespace imperative {

namespace {
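
// Collect every comp node touched by applying `def` to `inputs`: the comp
// nodes of all inputs plus those of the inferred output attributes.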
CompNode::UnorderedSet collect_comp_nodes(
        const OpDef& def, const SmallVector<TensorPtr>& inputs) {
    CompNode::UnorderedSet comp_nodes;
    for (auto&& input : inputs) {
        comp_nodes.insert(input->comp_node());
    }
    for (auto&& output_attr : def.infer_output_attrs(def, inputs)) {
        comp_nodes.insert(output_attr.comp_node);
    }
    return comp_nodes;
}

DeviceTimer::SharedEvent alloc_recorded_event(CompNode device) {
    auto event = EventPool::with_timer().alloc_shared(device);
    event->record();
    return event;
}

OprFootprint footprint{};

}  // namespace
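
// DeviceTimer keeps, per comp node, a base pair of (recorded event, host time).
// A later device timestamp is reported on the host timeline as the elapsed
// device time since that base event plus the host time recorded with it.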
void DeviceTimer::reset(thin_function<double()> host_timer) {
    CompNode::foreach ([this, host_timer](CompNode device) {
        m_base_event_table[device] = {alloc_recorded_event(device), host_timer()};
    });
    m_host_timer = host_timer;
}
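
// Record an event now, but return a deferred getter: the actual timestamp is
// only resolved (after waiting for the event on the host) when the returned
// closure is invoked.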
thin_function<double()> DeviceTimer::get_device_time(CompNode device) {
    auto event = EventPool::with_timer().alloc_shared(device);
    event->record();
    if (m_base_event_table.count(device) == 0) {
        m_base_event_table[device] = {alloc_recorded_event(device), m_host_timer()};
    }
    auto base = m_base_event_table[device];
    return [base, event] {
        auto [base_event, host_time] = base;
        // TODO: sync once for each compnode
        event->host_wait();
        return base_event->elapsed_time_until(*event) * 1000 + host_time;
    };
}

void DeviceTimer::clear() {
    m_base_event_table.clear();
}
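
// Assign a stable id to each recorded tensor. The map is keyed by the raw
// pointer, so a weak_ptr is kept to detect when the address has been reused
// by a different tensor, in which case a fresh id is issued.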
size_t TensorRecorder::record_tensor(const TensorPtr& tensor) {
    if (m_tensor_map.count(tensor.get()) > 0) {
        auto& [prev, id] = m_tensor_map[tensor.get()];
        if (prev.lock() != tensor) {
            prev = tensor;
            id = m_next_id++;
        }
        return id;
    } else {
        auto id = m_next_id++;
        m_tensor_map.insert({tensor.get(), {std::weak_ptr{tensor}, id}});
        return id;
    }
}

void TensorRecorder::clear() {
    m_next_id = 0;
    m_tensor_map.clear();
}
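
// Materialize all pending device timestamps so the returned profile contains
// plain values instead of closures that would query the devices again.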
Profile& Profiler::get_profile() {
    for (auto& entry : m_profile) {
        for (auto& [device, device_begin, device_end] : entry.device_list) {
            MGB_MARK_USED_VAR(device);
            device_begin = [value = device_begin()] { return value; };
            device_end = [value = device_end()] { return value; };
        }
    }
    return m_profile;
}
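
// Install hooks on every op trait: apply_on_physical_tensor is wrapped to
// record inputs, outputs and host/device timings for each op execution, and
// (when PROFILE_FOOTPRINT is set) apply_on_var_node is wrapped to attach
// memory/computation footprints to the entry currently on the stack.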
void Profiler::start(uint32_t flags) {
    m_host_timer.reset();
    m_device_timer.reset([&] { return m_host_timer.get_msecs(); });
    OpTrait::for_each_trait([this, flags](OpTrait& trait) {
        auto hook_apply_on_physical_tensor =
                make_shared_hook(&trait.apply_on_physical_tensor);
        auto hook_apply_on_var_node =
                make_shared_hook(&trait.apply_on_var_node);
        hook_apply_on_physical_tensor->apply_hook([this, flags](
                auto&& apply, const OpDef& def, const SmallVector<TensorPtr>& inputs) {
            auto shape2vector = [](const TensorShape& shape) {
                std::vector<size_t> vector_shape;
                for (size_t i = 0; i < shape.ndim; i++) {
                    vector_shape.push_back(shape[i]);
                }
                return vector_shape;
            };
            ProfileEntry entry;
            entry.id = m_entry_count++;
            // TODO: assign parent
            entry.parent = 0;
            // Record apply context and save to m_profile
            entry.op = def.copy();
            for (auto&& input : inputs) {
                entry.inputs.push_back({m_tensor_recorder.record_tensor(input),
                                        shape2vector(input->layout()),
                                        input->comp_node()});
            }
            double host_begin = m_host_timer.get_msecs();
            auto&& comp_nodes = collect_comp_nodes(def, inputs);
            for (auto&& comp_node : comp_nodes) {
                entry.device_list.push_back(
                        {comp_node,
                         m_device_timer.get_device_time(comp_node),
                         {}});
            }
            if (flags & PROFILE_FOOTPRINT) {
                MGB_LOCK_GUARD(m_lock);
                m_entry_stack.push({&def, &entry, std::this_thread::get_id()});
            }
            // Do real apply
            auto outputs = apply(def, inputs);
            for (auto& [cn, dev_begin, dev_end] : entry.device_list) {
                MGB_MARK_USED_VAR(cn);
                MGB_MARK_USED_VAR(dev_begin);
                dev_end = m_device_timer.get_device_time(cn);
            }
            entry.host = {host_begin, m_host_timer.get_msecs()};
            for (auto&& output : outputs) {
                entry.outputs.push_back(
                        {m_tensor_recorder.record_tensor(output),
                         shape2vector(output->layout()), output->comp_node()});
            }
            if (flags & PROFILE_FOOTPRINT) {
                mgb_assert(std::get<1>(m_entry_stack.top()) == &entry);
                MGB_LOCK_GUARD(m_lock);
                m_entry_stack.pop();
            }
            m_profile.push_back(std::move(entry));
            return outputs;
        });
        if (flags & PROFILE_FOOTPRINT) {
            hook_apply_on_var_node->apply_hook(
                    [this](auto&& apply, const OpDef& def,
                           VarNodeArray inputs) -> cg::OperatorNodeBase* {
                        auto* operator_node = apply(def, std::move(inputs));
                        std::remove_reference_t<decltype(m_entry_stack.top())> top;
                        {
                            MGB_LOCK_GUARD(m_lock);
                            if (m_entry_stack.empty()) {
                                return operator_node;
                            }
                            top = m_entry_stack.top();
                        }
                        auto [current_op, current_entry, thread_id] = top;
                        if (current_op != &def ||
                            thread_id != std::this_thread::get_id()) {
                            return operator_node;
                        }
                        auto&& footprint_result =
                                footprint.calc_footprint(operator_node);
                        current_entry->memory = footprint_result.memory;
                        current_entry->computation =
                                footprint_result.computation;
#if MGB_ENABLE_JSON
                        current_entry->param = footprint_result.param;
#endif
                        return operator_node;
                    });
        }
        m_hooker_list.push_back(std::move(hook_apply_on_physical_tensor));
        m_hooker_list.push_back(std::move(hook_apply_on_var_node));
    });
}
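
// Drop the installed hooks and wait on every entry's device events so that
// all recorded device timings become available.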
void Profiler::stop() {
    m_hooker_list.clear();
    for (auto& entry : m_profile) {
        entry.wait_device();
    }
}

void Profiler::clear() {
    mgb_assert(m_entry_stack.empty(),
               "entry_stack should be empty after profile");
    mgb_assert(m_hooker_list.empty(), "hooks should be released");
    m_profile.clear();
    m_entry_count = 0;
    m_device_timer.clear();
    m_tensor_recorder.clear();
}

}  // namespace imperative
}  // namespace mgb
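
For orientation, below is a minimal usage sketch of driving this profiler from C++. It relies only on what this file itself demonstrates (start()/stop()/clear()/get_profile(), the PROFILE_FOOTPRINT flag, and a range-iterable Profile of ProfileEntry); the authoritative declarations live in megbrain/imperative/profiler.h, and in particular the Profiler::PROFILE_FOOTPRINT qualification below is an inferred assumption, not confirmed by this file.

// Usage sketch, not part of profiler.cpp. Assumption: PROFILE_FOOTPRINT is
// reachable as Profiler::PROFILE_FOOTPRINT (it is used unqualified inside the
// member functions above, so this qualification is inferred).
#include "megbrain/imperative/profiler.h"

using namespace mgb::imperative;

void profile_session(Profiler& profiler) {
    // Install the hooks; every op applied afterwards is recorded.
    profiler.start(Profiler::PROFILE_FOOTPRINT);

    // ... run imperative ops here; each apply_on_physical_tensor call goes
    // through the hook installed in Profiler::start() ...

    // Drop the hooks and wait for all recorded device events.
    profiler.stop();

    // Device timings are now plain values; inspect the recorded entries.
    for (auto& entry : profiler.get_profile()) {
        (void)entry;  // e.g. entry.op, entry.host, entry.device_list,
                      //      entry.memory, entry.computation
    }

    // Reset internal state before starting another session.
    profiler.clear();
}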
