From defbc20efb2ca851167e80e9b1f1f2d1aa60d86c Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Fri, 6 Aug 2021 14:48:55 +0800
Subject: [PATCH] fix(profiler): fix profiler in dtr

GitOrigin-RevId: 79b5a6b52615cbda6b4017cc9d34c39d62f09bca
---
 .../src/impl/interpreter/interpreter_impl.cpp      | 20 +++++++-----
 imperative/src/impl/interpreter/interpreter_impl.h |  5 +--
 imperative/src/impl/profiler/chrome_timeline.cpp   |  6 ++--
 imperative/src/impl/profiler/events.h              |  1 +
 imperative/src/impl/profiler/states.h              | 38 +++++++++++++++++-----
 5 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp
index 158e2942..48b23f9c 100644
--- a/imperative/src/impl/interpreter/interpreter_impl.cpp
+++ b/imperative/src/impl/interpreter/interpreter_impl.cpp
@@ -616,7 +616,7 @@ void ChannelImpl::release_tensor(TensorInfo* dest) {
 void ChannelImpl::regenerate(TensorInfo* dest) {
     if (dest->evict_type == EvictType::DROP) {
         auto &&path = dest->producer;
-        m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest});
+        m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest, "dtr"});
         if (!m_applying) flush_apply_stack();
     } else if (dest->evict_type == EvictType::SWAP) {
         MGB_RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen);
@@ -625,7 +625,7 @@ void ChannelImpl::regenerate(TensorInfo* dest) {
     }
 }
 
-void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
+void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
     using namespace ranges;
     using namespace ranges::views;
     auto& state = get_worker_state();
@@ -689,7 +689,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
         }
         return outputs;
     };
-    MGB_RECORD_EVENT(OpExecuteEvent, apply_id);
+    MGB_RECORD_EVENT(OpExecuteEvent, apply_id, {}, reason);
     // Begin profiling operator
     SmallVector<std::pair<CompNode, uint64_t>> kernels;
     if (profiling_device) {
@@ -769,7 +769,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
         }
         m_dtr.unpin(cmd.inputs);
     }
-    MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id);
+    MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id, {}, reason);
     // End profiling operator
 }
 
@@ -777,7 +777,7 @@ void ChannelImpl::flush_apply_stack() {
     m_applying = true;
     auto& state = get_worker_state();
     while (!m_apply_stack.empty()) {
-        auto& [cmd, idx, recomp] = m_apply_stack.top(); // cmd.inputs[0~idx-1] is in memory
+        auto& [cmd, idx, recomp, reason] = m_apply_stack.top(); // cmd.inputs[0~idx-1] is in memory
         if (idx == 0) {
             if (state.options.enable_dtr_auto_drop) {
                 m_dtr.pin(cmd.inputs);
@@ -801,10 +801,9 @@ void ChannelImpl::flush_apply_stack() {
         }
         if (regen) continue;
         // the required input tensors are already in memory
-        auto cmd_backup = cmd;
-        auto recomp_backup = recomp;
+        auto [cmd_backup, recomp_backup, reason_backup] = std::make_tuple(cmd, recomp, reason);
         m_apply_stack.pop();
-        do_apply_op(cmd_backup);
+        do_apply_op(cmd_backup, reason_backup);
         if (recomp_backup) {
             MGB_RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen);
             for (auto o : cmd_backup.outputs) {
@@ -829,6 +828,7 @@ bool ChannelImpl::auto_evict(size_t force_num) {
         sample_on_device(m_dtr.comp_node, false);
         auto best = m_dtr.find_best_tensor(state.options.enable_dtr_sqrt_sampling && !force_num);
         if (!best) {
+            MGB_RECORD_EVENT(AutoEvictFinishEvent);
             break;
         }
         if (best->ptr.unique() && best->ptr->blob().unique()) {
@@ -947,7 +947,9 @@ void ChannelImpl::alloc_tensor_with_evict(Blob* x) {
             set_log_level(pre_level);
             mgb_log_warn("reallocating all cuda memory to alleviate fragmentation, the performance may be affected");
             set_log_level(LogLevel::NO_LOG);
+            imperative_log_profile_begin("defrag");
             BlobManager::inst()->defrag(x->comp_node());
+            imperative_log_profile_end("defrag");
             BlobManager::inst()->alloc_direct(x, x->size());
         }
     });
@@ -1025,7 +1027,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
                         return;
                     }
                 }
-                m_apply_stack.push({cmd, 0, nullptr});
+                m_apply_stack.push({cmd, 0, nullptr, "cmd"});
                 flush_apply_stack();
                 for (size_t i = 0; i < cmd.outputs.size(); ++i) {
                     auto output = cmd.outputs[i];
diff --git a/imperative/src/impl/interpreter/interpreter_impl.h b/imperative/src/impl/interpreter/interpreter_impl.h
index cc998bff..e39e08a9 100644
--- a/imperative/src/impl/interpreter/interpreter_impl.h
+++ b/imperative/src/impl/interpreter/interpreter_impl.h
@@ -104,8 +104,8 @@ private:
     void release_tensor(TensorInfo* dest);
 
     void regenerate(TensorInfo* dest);
-    void do_apply_op(const ApplyOp& cmd);
     void flush_apply_stack();
+    void do_apply_op(const ApplyOp& cmd, std::string reason);
     
     std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPtr>> init_output_and_workspace(
         const OpDef& def,
@@ -150,7 +150,8 @@ private:
     std::exception_ptr m_worker_exc;
     std::function<void(std::string, std::string)> m_profile_dump_callback;
     size_t m_storage_id = 0;
-    std::stack<std::tuple<ApplyOp, size_t, TensorInfo*>> m_apply_stack;
+    // TODO: use explicit struct
+    std::stack<std::tuple<ApplyOp, size_t, TensorInfo*, std::string>> m_apply_stack;
     bool m_applying = false;
     bool m_closed = false;
 
diff --git a/imperative/src/impl/profiler/chrome_timeline.cpp b/imperative/src/impl/profiler/chrome_timeline.cpp
index 64c1f43f..69e6cc21 100644
--- a/imperative/src/impl/profiler/chrome_timeline.cpp
+++ b/imperative/src/impl/profiler/chrome_timeline.cpp
@@ -200,13 +200,13 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
 
     ChromeTraceEvent& new_host_event(std::string name, char ph) {
         return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time));
-    };
+    }
 
     ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) {
         using namespace std::literals::chrono_literals;
         auto time = since_start(to_device_time(current->time, device));
         return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time);
-    };
+    }
 
     const char* to_cstr(TensorCommandKind kind) {
         switch(kind) {
@@ -241,14 +241,12 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
             new_host_event("OpDispatch", 'E').args(current_op->detail());
         } else if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) {
             mgb_assert(event.op_id != 0);
-            current_op->execute_begin = current->time;
             new_host_event(current_op->name, 'B');
             new_host_event(pid_str, 't')
                     .cat("OpDispatch")
                     .id(current_op->id)
                     .scope(pid_str);
         } else if constexpr (std::is_same_v<TEvent, OpExecuteFinishEvent>) {
-            current_op->execute_end = current->time;
             new_host_event(current_op->name, 'E')
                     .args(current_op->detail());
         } else if constexpr (std::is_same_v<TEvent, KernelLaunchEvent>) {
diff --git a/imperative/src/impl/profiler/events.h b/imperative/src/impl/profiler/events.h
index dd034b32..62d55b04 100644
--- a/imperative/src/impl/profiler/events.h
+++ b/imperative/src/impl/profiler/events.h
@@ -84,6 +84,7 @@ DEF_DUR_EVENT(OpOutput, {
 DEF_DUR_EVENT(OpExecute, {
     uint64_t op_id;
     SmallVector<CompNode> device_list;
+    std::string reason;
 });
 
 DEF_DUR_EVENT(KernelLaunch, {
diff --git a/imperative/src/impl/profiler/states.h b/imperative/src/impl/profiler/states.h
index 0c758637..7aade873 100644
--- a/imperative/src/impl/profiler/states.h
+++ b/imperative/src/impl/profiler/states.h
@@ -62,8 +62,13 @@ struct ProfileOperatorState {
     CompNode device;
     Trace trace;
 
-    profiler::HostTime execute_begin;
-    profiler::HostTime execute_end;
+    struct Execution {
+        std::string reason;
+        profiler::HostTime begin;
+        profiler::HostTime end;
+    };
+
+    SmallVector<Execution> executions;
 
     nlohmann::json detail() {
         nlohmann::json args;
@@ -285,7 +290,7 @@ public:
                 op.outputs = event.outputs;
                 op.trace = event.trace;
                 for (auto&& output: event.outputs) {
-                    m_tensors.at(output).source = op.id;
+                    m_tensors[output].source = op.id;
                 }
             } else if constexpr (std::is_same_v<T, TensorDeclareEvent>) {
                 auto& tensor = m_tensors[event.tensor_id];
@@ -293,7 +298,7 @@ public:
                 tensor.id = event.tensor_id;
                 tensor.name = event.name;
             } else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
-                auto& tensor = m_tensors.at(event.tensor_id);
+                auto& tensor = m_tensors[event.tensor_id];
                 if (!m_device_tid_table.count(event.device)) {
                     m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()};
                 }
@@ -308,15 +313,24 @@ public:
             using T = std::decay_t<decltype(event)>;
             // update current_op/tensor
             if constexpr (is_op_event<T>::value) {
-                current_op = &m_operators.at(event.op_id);
+                current_op = &m_operators[event.op_id];
+                if (current_op->id == 0) {
+                    current_op->id = event.op_id;
+                    current_op->name = "UnknownOperator";
+                }
             } else if constexpr (is_tensor_event<T>::value) {
-                mgb_assert(m_tensors.count(event.tensor_id) != 0, "tensor not found");
-                current_tensor = &m_tensors.at(event.tensor_id);
+                current_tensor = &m_tensors[event.tensor_id];
+                if (current_tensor->id == 0) {
+                    current_tensor->id = event.tensor_id;
+                    current_tensor->name = "UnknownTensor";
+                }
             }
             if constexpr (std::is_same_v<T, OpExecuteEvent>) {
-                current_op->execute_begin = current->time;
+                current_op->executions.emplace_back();
+                current_op->executions.back().reason = event.reason;
+                current_op->executions.back().begin = current->time;
             } else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) {
-                current_op->execute_end = current->time;
+                current_op->executions.back().end = current->time;
             }
             // update counters
             if constexpr (std::is_same_v<T, OpDispatchEvent>) {
@@ -337,6 +351,12 @@ public:
                 }
             } else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) {
                 inc_counter("nr_exception", 1);
+            } else if constexpr (std::is_same_v<T, KernelLaunchFinishEvent>) {
+                auto& execution = current_op->executions.back();
+                if (execution.reason == "dtr") {
+                    auto overhead = to_device_time(current->time, event.device) - to_device_time(execution.begin, event.device);
+                    inc_counter("dtr_overhead_us", std::chrono::duration_cast<std::chrono::microseconds>(overhead).count());
+                }
             }
             // visit_event_impl
             self.visit_event(event);