From defbc20efb2ca851167e80e9b1f1f2d1aa60d86c Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Fri, 6 Aug 2021 14:48:55 +0800 Subject: [PATCH] fix(profiler): fix profiler in dtr GitOrigin-RevId: 79b5a6b52615cbda6b4017cc9d34c39d62f09bca --- .../src/impl/interpreter/interpreter_impl.cpp | 20 +++++++----- imperative/src/impl/interpreter/interpreter_impl.h | 5 +-- imperative/src/impl/profiler/chrome_timeline.cpp | 6 ++-- imperative/src/impl/profiler/events.h | 1 + imperative/src/impl/profiler/states.h | 38 +++++++++++++++++----- 5 files changed, 46 insertions(+), 24 deletions(-) diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp index 158e2942..48b23f9c 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.cpp +++ b/imperative/src/impl/interpreter/interpreter_impl.cpp @@ -616,7 +616,7 @@ void ChannelImpl::release_tensor(TensorInfo* dest) { void ChannelImpl::regenerate(TensorInfo* dest) { if (dest->evict_type == EvictType::DROP) { auto &&path = dest->producer; - m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest}); + m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest, "dtr"}); if (!m_applying) flush_apply_stack(); } else if (dest->evict_type == EvictType::SWAP) { MGB_RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen); @@ -625,7 +625,7 @@ void ChannelImpl::regenerate(TensorInfo* dest) { } } -void ChannelImpl::do_apply_op(const ApplyOp& cmd) { +void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) { using namespace ranges; using namespace ranges::views; auto& state = get_worker_state(); @@ -689,7 +689,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) { } return outputs; }; - MGB_RECORD_EVENT(OpExecuteEvent, apply_id); + MGB_RECORD_EVENT(OpExecuteEvent, apply_id, {}, reason); // Begin profiling operator SmallVector> kernels; if (profiling_device) { @@ -769,7 +769,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) { } m_dtr.unpin(cmd.inputs); } - MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id); + MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id, {}, reason); // End profiling operator } @@ -777,7 +777,7 @@ void ChannelImpl::flush_apply_stack() { m_applying = true; auto& state = get_worker_state(); while (!m_apply_stack.empty()) { - auto& [cmd, idx, recomp] = m_apply_stack.top(); // cmd.inputs[0~idx-1] is in memory + auto& [cmd, idx, recomp, reason] = m_apply_stack.top(); // cmd.inputs[0~idx-1] is in memory if (idx == 0) { if (state.options.enable_dtr_auto_drop) { m_dtr.pin(cmd.inputs); @@ -801,10 +801,9 @@ void ChannelImpl::flush_apply_stack() { } if (regen) continue; // the required input tensors are already in memory - auto cmd_backup = cmd; - auto recomp_backup = recomp; + auto [cmd_backup, recomp_backup, reason_backup] = std::make_tuple(cmd, recomp, reason); m_apply_stack.pop(); - do_apply_op(cmd_backup); + do_apply_op(cmd_backup, reason_backup); if (recomp_backup) { MGB_RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen); for (auto o : cmd_backup.outputs) { @@ -829,6 +828,7 @@ bool ChannelImpl::auto_evict(size_t force_num) { sample_on_device(m_dtr.comp_node, false); auto best = m_dtr.find_best_tensor(state.options.enable_dtr_sqrt_sampling && !force_num); if (!best) { + MGB_RECORD_EVENT(AutoEvictFinishEvent); break; } if (best->ptr.unique() && best->ptr->blob().unique()) { @@ -947,7 +947,9 @@ void ChannelImpl::alloc_tensor_with_evict(Blob* x) { set_log_level(pre_level); mgb_log_warn("reallocating all cuda memory to alleviate fragmentation, the performance may be affected"); set_log_level(LogLevel::NO_LOG); + imperative_log_profile_begin("defrag"); BlobManager::inst()->defrag(x->comp_node()); + imperative_log_profile_end("defrag"); BlobManager::inst()->alloc_direct(x, x->size()); } }); @@ -1025,7 +1027,7 @@ void ChannelImpl::process_one_task(Command& icmd) { return; } } - m_apply_stack.push({cmd, 0, nullptr}); + m_apply_stack.push({cmd, 0, nullptr, "cmd"}); flush_apply_stack(); for (size_t i = 0; i < cmd.outputs.size(); ++i) { auto output = cmd.outputs[i]; diff --git a/imperative/src/impl/interpreter/interpreter_impl.h b/imperative/src/impl/interpreter/interpreter_impl.h index cc998bff..e39e08a9 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.h +++ b/imperative/src/impl/interpreter/interpreter_impl.h @@ -104,8 +104,8 @@ private: void release_tensor(TensorInfo* dest); void regenerate(TensorInfo* dest); - void do_apply_op(const ApplyOp& cmd); void flush_apply_stack(); + void do_apply_op(const ApplyOp& cmd, std::string reason); std::tuple, SmallVector, SmallVector> init_output_and_workspace( const OpDef& def, @@ -150,7 +150,8 @@ private: std::exception_ptr m_worker_exc; std::function m_profile_dump_callback; size_t m_storage_id = 0; - std::stack> m_apply_stack; + // TODO: use explicit struct + std::stack> m_apply_stack; bool m_applying = false; bool m_closed = false; diff --git a/imperative/src/impl/profiler/chrome_timeline.cpp b/imperative/src/impl/profiler/chrome_timeline.cpp index 64c1f43f..69e6cc21 100644 --- a/imperative/src/impl/profiler/chrome_timeline.cpp +++ b/imperative/src/impl/profiler/chrome_timeline.cpp @@ -200,13 +200,13 @@ struct ChromeTimelineEventVisitor: EventVisitor { ChromeTraceEvent& new_host_event(std::string name, char ph) { return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time)); - }; + } ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { using namespace std::literals::chrono_literals; auto time = since_start(to_device_time(current->time, device)); return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time); - }; + } const char* to_cstr(TensorCommandKind kind) { switch(kind) { @@ -241,14 +241,12 @@ struct ChromeTimelineEventVisitor: EventVisitor { new_host_event("OpDispatch", 'E').args(current_op->detail()); } else if constexpr (std::is_same_v) { mgb_assert(event.op_id != 0); - current_op->execute_begin = current->time; new_host_event(current_op->name, 'B'); new_host_event(pid_str, 't') .cat("OpDispatch") .id(current_op->id) .scope(pid_str); } else if constexpr (std::is_same_v) { - current_op->execute_end = current->time; new_host_event(current_op->name, 'E') .args(current_op->detail()); } else if constexpr (std::is_same_v) { diff --git a/imperative/src/impl/profiler/events.h b/imperative/src/impl/profiler/events.h index dd034b32..62d55b04 100644 --- a/imperative/src/impl/profiler/events.h +++ b/imperative/src/impl/profiler/events.h @@ -84,6 +84,7 @@ DEF_DUR_EVENT(OpOutput, { DEF_DUR_EVENT(OpExecute, { uint64_t op_id; SmallVector device_list; + std::string reason; }); DEF_DUR_EVENT(KernelLaunch, { diff --git a/imperative/src/impl/profiler/states.h b/imperative/src/impl/profiler/states.h index 0c758637..7aade873 100644 --- a/imperative/src/impl/profiler/states.h +++ b/imperative/src/impl/profiler/states.h @@ -62,8 +62,13 @@ struct ProfileOperatorState { CompNode device; Trace trace; - profiler::HostTime execute_begin; - profiler::HostTime execute_end; + struct Execution { + std::string reason; + profiler::HostTime begin; + profiler::HostTime end; + }; + + SmallVector executions; nlohmann::json detail() { nlohmann::json args; @@ -285,7 +290,7 @@ public: op.outputs = event.outputs; op.trace = event.trace; for (auto&& output: event.outputs) { - m_tensors.at(output).source = op.id; + m_tensors[output].source = op.id; } } else if constexpr (std::is_same_v) { auto& tensor = m_tensors[event.tensor_id]; @@ -293,7 +298,7 @@ public: tensor.id = event.tensor_id; tensor.name = event.name; } else if constexpr (std::is_same_v) { - auto& tensor = m_tensors.at(event.tensor_id); + auto& tensor = m_tensors[event.tensor_id]; if (!m_device_tid_table.count(event.device)) { m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()}; } @@ -308,15 +313,24 @@ public: using T = std::decay_t; // update current_op/tensor if constexpr (is_op_event::value) { - current_op = &m_operators.at(event.op_id); + current_op = &m_operators[event.op_id]; + if (current_op->id == 0) { + current_op->id = event.op_id; + current_op->name = "UnknownOperator"; + } } else if constexpr (is_tensor_event::value) { - mgb_assert(m_tensors.count(event.tensor_id) != 0, "tensor not found"); - current_tensor = &m_tensors.at(event.tensor_id); + current_tensor = &m_tensors[event.tensor_id]; + if (current_tensor->id == 0) { + current_tensor->id = event.tensor_id; + current_tensor->name = "UnknownTensor"; + } } if constexpr (std::is_same_v) { - current_op->execute_begin = current->time; + current_op->executions.emplace_back(); + current_op->executions.back().reason = event.reason; + current_op->executions.back().begin = current->time; } else if constexpr (std::is_same_v) { - current_op->execute_end = current->time; + current_op->executions.back().end = current->time; } // update counters if constexpr (std::is_same_v) { @@ -337,6 +351,12 @@ public: } } else if constexpr (std::is_same_v) { inc_counter("nr_exception", 1); + } else if constexpr (std::is_same_v) { + auto& execution = current_op->executions.back(); + if (execution.reason == "dtr") { + auto overhead = to_device_time(current->time, event.device) - to_device_time(execution.begin, event.device); + inc_counter("dtr_overhead_us", std::chrono::duration_cast(overhead).count()); + } } // visit_event_impl self.visit_event(event);