From 9d47c3babdec5077548e161be790db23a7e60508 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Mon, 26 Apr 2021 13:26:01 +0800
Subject: [PATCH] feat(profiler): imperative profiler support tracing

GitOrigin-RevId: b247472feba6d28416f52874c3517e50a8c2bd49
---
 imperative/python/megengine/jit/tracing.py         |  17 +-
 imperative/python/src/graph_rt.cpp                 |   6 +-
 imperative/src/impl/profiler_plugin.cpp            | 207 +++++++++++++++++++++
 .../include/megbrain/imperative/profiler_plugin.h  |  46 +++++
 4 files changed, 272 insertions(+), 4 deletions(-)
 create mode 100644 imperative/src/impl/profiler_plugin.cpp
 create mode 100644 imperative/src/include/megbrain/imperative/profiler_plugin.h

diff --git a/imperative/python/megengine/jit/tracing.py b/imperative/python/megengine/jit/tracing.py
index 31999f1c..8f617364 100644
--- a/imperative/python/megengine/jit/tracing.py
+++ b/imperative/python/megengine/jit/tracing.py
@@ -17,7 +17,7 @@ from typing import Any
 
 import numpy as np
 
-from ..core._imperative_rt import GraphProfiler, SerializationMetadata
+from ..core._imperative_rt import GraphProfiler, GraphProfiler2, SerializationMetadata
 from ..core._imperative_rt.core2 import Tensor as RawTensor
 from ..core._imperative_rt.core2 import (
     TensorWeakRef,
@@ -39,6 +39,7 @@ from ..core.ops.special import Const
 from ..core.tensor import megbrain_graph as G
 from ..core.tensor.utils import setscalar
 from ..utils.naming import AutoNaming
+from ..utils.profiler import is_profiling
 from .dtr_config import DTRConfig
 from .graph_opt_config import GraphOptimizationConfig
 from .sublinear_memory_config import SublinearMemoryConfig
@@ -160,6 +161,7 @@ class trace:
         self._dtr_config = dtr_config
         self._profiling = profiling
         self._profiler = None
+        self._profiler2 = None
         self._graph_opt_level = opt_level
         self._graph_opt_config = graph_opt_config
         self._symbolic_shape = symbolic_shape
@@ -382,7 +384,8 @@ class trace:
         lazy_eval_graph.options.graph_opt_level = self._graph_opt_level
         lazy_eval_graph._set_priority_to_id([*lazy_eval_links, *readers])
         lazy_eval_graph.compile(*lazy_eval_links, *readers)
-        lazy_eval_graph()
+        self._execute_graph(lazy_eval_graph)
+        lazy_eval_graph.wait()
         for r, x in zip(readers, lazy_eval_tensors):
             # get values from lazy_eval_graph and assign to lazy_eval tensor
             x._handle = RawTensor(r.op.get_value())._handle
@@ -401,7 +404,7 @@ class trace:
             else:
                 if self._graph is None:
                     self._compile()
-                self._graph.execute()
+                self._execute_graph(self._graph)
 
         def do_finalize():
             escaped_tensors = self._take_escaped_tensors()
@@ -532,9 +535,17 @@ class trace:
         # profile
         if self._profiling:
             self._profiler = GraphProfiler(graph)
+        self._profiler2 = None
         if int(os.getenv("MEGENGINE_INPLACE_UPDATE", "0")):
             graph.options.var_sanity_check_first_run = False
 
+    def _execute_graph(self, graph: G.Graph, *args):  # run `graph`, keeping the GraphProfiler2 plugin in sync with is_profiling()
+        if is_profiling() and (self._profiler2 is None):  # profiling is on and no plugin is held yet
+            self._profiler2 = GraphProfiler2(graph)  # NOTE(review): reused on later calls even if `graph` differs
+        elif not is_profiling() and (self._profiler2 is not None):  # profiling is off: drop the held plugin
+            self._profiler2 = None
+        graph.execute(*args)
+
     def _compile(self):
         graph = self._graph = G.Graph()
         graph.options.async_exec_level = 0b100
diff --git a/imperative/python/src/graph_rt.cpp b/imperative/python/src/graph_rt.cpp
index 837bc91f..b6fb0c20 100644
--- a/imperative/python/src/graph_rt.cpp
+++ b/imperative/python/src/graph_rt.cpp
@@ -23,7 +23,7 @@
 #include "./common.h"
 #include "./ops.h"
 #include "megbrain/gopt/inference.h"
-
+#include "megbrain/imperative/profiler_plugin.h"
 
 namespace py = pybind11;
 
@@ -239,6 +239,10 @@ void init_graph_rt(py::module m) {
                 }))
         .def("get", [](_CompGraphProfilerImpl& profiler) { return profiler._get_result(); });
 
+    using interpreter::intl::ProfilerPlugin;
+    py::class_<ProfilerPlugin, std::shared_ptr<ProfilerPlugin>>(m, "GraphProfiler2")
+        .def(py::init<cg::ComputingGraph*>());
+
     auto GraphOptimizeOptions = py::class_<_OptimizeForInferenceOptions>(m, "GraphOptimizeOptions")
         .def(py::init())
         .def("serialize", &_OptimizeForInferenceOptions::serialize)
diff --git a/imperative/src/impl/profiler_plugin.cpp b/imperative/src/impl/profiler_plugin.cpp
new file mode 100644
index 00000000..d4a462f7
--- /dev/null
+++ b/imperative/src/impl/profiler_plugin.cpp
@@ -0,0 +1,207 @@
+/**
+ * \file imperative/src/impl/profiler_plugin.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include "megbrain/imperative/profiler_plugin.h"
+
+#include "megbrain/graph.h"
+#include "megbrain/graph/event.h"
+
+#include "./profiler/events.h"
+
+namespace mgb::imperative::interpreter::intl {
+
+ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) {  // registers graph-event receivers that emit Profiler records
+    using namespace cg;
+    using namespace cg::event;
+    using namespace profiler;
+    auto on_seq_start = [this](CompSeqExecBeforeStart const& event) {  // emits dispatch/declare records for the whole sequence
+        // proxy graphs are rejected; the opr/var tables are built only while both are empty
+        mgb_assert(!event.graph->options().imperative_proxy_graph);
+        if (m_opr_dict.empty() && m_var_dict.empty()) {
+            init_seq(event.exec);
+        }
+        Profiler::record<ScopeEvent>("DispatchOprs");
+        event.exec->iter_opr_seq([this](OperatorNodeBase* opr) -> bool{
+            auto& opr_info = get_opr_info(opr);
+            SmallVector<uint64_t> inputs;
+            for (auto input: opr->input()) {
+                inputs.push_back(get_var_info(input).id);
+            }
+            SmallVector<uint64_t> outputs;
+            for (auto output: opr->output()) {
+                outputs.push_back(get_var_info(output).id);
+            }
+            auto opr_name = opr->dyn_typeinfo()->name;
+            auto copy_params = [params = opr_info.params] { return *params; };  // holds the shared_ptr; the map is copied only when invoked
+            Profiler::record<OpDispatchEvent>(opr_info.id, opr_name, copy_params, inputs, outputs);
+            for (auto output: opr->output()) {
+                auto var_id = get_var_info(output).id;
+                Profiler::record<TensorDeclareEvent>(var_id);
+            }
+            return true;
+        });
+        Profiler::record<ScopeFinishEvent>("DispatchOprs");
+        Profiler::record<ScopeEvent>("Constants");
+        for (auto&& [var, var_info]: m_var_dict) {
+            if (var_info->is_const) {  // var was first seen by init_seq as an input, not as an output
+                bool valid = var->dev_tensor_valid();
+                auto layout = valid ? var->layout() : TensorLayout();
+                Profiler::record<TensorDeclareEvent>(var_info->id);
+                Profiler::record<TensorProduceEvent>(var_info->id, layout, var->comp_node(), valid ? var->dev_tensor().raw_ptr() : nullptr);
+            } else {
+                var_info->rt_ref_cnt = var_info->ref_cnt;  // reset the per-run countdown from the use count
+            }
+        }
+        Profiler::record<ScopeFinishEvent>("Constants");
+    };
+    auto on_opr_start = [this](OprExecStart const& event) {  // queues execute/input records on the opr's comp node
+        OperatorNodeBase* opr = event.opr;
+        auto& opr_info = get_opr_info(opr);
+        auto comp_node = opr_info.comp_node;
+        auto runner = [&opr_info] {  // NOTE(review): references an m_opr_dict entry; assumes it is still alive when the task runs
+            Profiler::record<OpExecuteEvent>(opr_info.id);
+        };
+        event.env->dispatch_on_comp_node(comp_node, runner);
+        auto inputs = opr->input();
+        for (auto&& input: inputs) {
+            auto& var_info = get_var_info(input);
+            auto runner = [&var_info, input] {
+                auto inp_id = var_info.id;
+                Profiler::record<OpInputEvent>(inp_id, input->shape());
+                Profiler::record<TensorUsageEvent>(inp_id);
+                Profiler::record<OpInputFinishEvent>(inp_id, input->shape());
+            };
+            event.env->dispatch_on_comp_node(comp_node, runner);
+        }
+    };
+    auto on_opr_finish = [this](OprExecKernelEnd const& event) {  // queues release/output/finish records on the opr's comp node
+        OperatorNodeBase* opr = event.opr;
+        auto& opr_info = get_opr_info(opr);
+        auto comp_node = opr_info.comp_node;
+        auto inputs = opr->input();
+        auto outputs = opr->output();
+        for (auto input: inputs) {
+            auto& var_info = get_var_info(input);
+            auto runner = [&var_info] {
+                if (!var_info.is_const) {
+                    if (--var_info.rt_ref_cnt == 0) {  // countdown hit zero: no counted use remains in this run
+                        Profiler::record<TensorReleaseEvent>(var_info.id);
+                    }
+                }
+            };
+            event.env->dispatch_on_comp_node(comp_node, runner);
+        }
+        for (auto output: outputs) {
+            auto& var_info = get_var_info(output);
+            mgb_assert(comp_node == output->comp_node(), "opr comp_node mismatch");
+            auto runner = [&var_info, output] {
+                auto out_id = var_info.id;
+                bool valid = output->dev_tensor_valid();
+                auto layout = valid ? output->layout() : TensorLayout();
+                Profiler::record<OpOutputEvent>(out_id, output->shape());
+                Profiler::record<TensorProduceEvent>(out_id, layout, output->comp_node(), valid ? output->dev_tensor().raw_ptr() : nullptr);
+                if (!var_info.ref_cnt) {  // init_seq counted no use of this output, so release it right away
+                    Profiler::record<TensorReleaseEvent>(var_info.id);
+                }
+                Profiler::record<OpOutputFinishEvent>(out_id, output->shape());
+            };
+            event.env->dispatch_on_comp_node(comp_node, runner);
+        }
+        auto runner = [&opr_info]() {
+            Profiler::record<OpExecuteFinishEvent>(opr_info.id);
+        };
+        event.env->dispatch_on_comp_node(comp_node, runner);
+    };
+    auto on_before_kern = [this](BeforeKernel const& event) {  // NOTE(review): the opr id is passed as both leading arguments below; confirm
+        OperatorNodeBase* opr = event.opr;
+        Profiler::record<KernelExecuteEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node));
+    };
+    auto on_after_kern = [this](AfterKernel const& event) {  // mirrors on_before_kern with the finish event
+        OperatorNodeBase* opr = event.opr;
+        Profiler::record<KernelExecuteFinishEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node));
+    };
+    auto on_graph_compile = [this](const CompSeqOrderDetermined&) {  // drop both tables; on_seq_start rebuilds them when empty
+        m_opr_dict.clear();
+        m_var_dict.clear();
+    };
+    auto on_seq_finish = [this](CompSeqExecFinished const& event) {  // emits an erase record for every tracked var
+        for (auto&& [var, var_info]: m_var_dict) {
+            MGB_MARK_USED_VAR(var);
+            if (var_info->is_const) {  // const vars get their release record only here
+                Profiler::record<TensorReleaseEvent>(var_info->id);
+            }
+            Profiler::record<TensorEraseEvent>(var_info->id, var_info->ref_cnt);
+        }
+    };
+    add_event_handler(graph->event().register_receiver<CompSeqExecBeforeStart>(on_seq_start));
+    add_event_handler(graph->event().register_receiver<OprExecStart>(on_opr_start));
+    add_event_handler(graph->event().register_receiver<OprExecKernelEnd>(on_opr_finish));
+    add_event_handler(graph->event().register_receiver<BeforeKernel>(on_before_kern));
+    add_event_handler(graph->event().register_receiver<AfterKernel>(on_after_kern));
+    add_event_handler(graph->event().register_receiver<CompSeqOrderDetermined>(on_graph_compile));
+    add_event_handler(graph->event().register_receiver<CompSeqExecFinished>(on_seq_finish));
+}
+
+void ProfilerPlugin::init_seq(cg::AsyncExecutable *comp_seq) {  // fills m_opr_dict and m_var_dict from comp_seq's opr sequence
+    mgb_assert(m_opr_dict.empty());
+    mgb_assert(m_var_dict.empty());
+    comp_seq->iter_opr_seq([this](cg::OperatorNodeBase* opr){
+        auto comp_nodes = get_opr_comp_node_set(opr);
+        mgb_assert(comp_nodes.size() == 1);  // register_opr repeats this check with a message
+        register_opr(opr);
+        for (auto&& input: opr->input()) {
+            if (m_var_dict.count(input) == 0) {
+                register_var(input).is_const = true;  // not an output of any opr visited so far
+            } else {
+                get_var_info(input).ref_cnt++;  // another use of an already registered var
+            }
+        }
+        for (auto&& output: opr->output()) {
+            register_var(output).is_const = false;
+        }
+        //TODO: check ref_cnt (the first use of a const input is not counted above)
+        return true;
+    });
+}
+
+ProfilerPlugin::OprInfo& ProfilerPlugin::register_opr(cg::OperatorNodeBase *opr) {  // builds an OprInfo for opr and stores it in m_opr_dict
+    OprInfo info;
+    info.id = Profiler::next_id();
+    auto params = std::make_shared<std::unordered_map<std::string, std::string>>();
+    auto params_json = opr->to_json();
+    for (auto&& [k, v]: params_json->cast_final<json::Object>().get_impl()) {  // flatten the top-level JSON object into string pairs
+        params->insert({k.get_impl(), v->to_string()});
+    }
+    info.params = std::move(params);
+    auto comp_nodes = cg::get_opr_comp_node_set(opr);
+    mgb_assert(comp_nodes.size() == 1, "only support single comp_node opr");
+    info.comp_node = *comp_nodes.begin();
+    return m_opr_dict.insert({opr, info}).first->second;  // insert() keeps an existing entry; that entry is then returned
+}
+
+ProfilerPlugin::VarInfo& ProfilerPlugin::register_var(cg::VarNode *var) {  // creates a zeroed VarInfo with a fresh id for var
+    auto info = std::make_unique<VarInfo>();
+    info->id = Profiler::next_id();
+    info->is_const = false;
+    info->ref_cnt = 0;
+    info->rt_ref_cnt = 0;
+    return *m_var_dict.insert({var, std::move(info)}).first->second;  // if var is already present, the old entry is kept and returned
+}
+
+ProfilerPlugin::OprInfo& ProfilerPlugin::get_opr_info(cg::OperatorNodeBase *opr) {  // looks up the OprInfo stored for opr
+    return m_opr_dict.at(opr);  // at() throws std::out_of_range when opr is not in the table
+}
+
+ProfilerPlugin::VarInfo& ProfilerPlugin::get_var_info(cg::VarNode *var) {  // looks up the VarInfo stored for var
+    return *m_var_dict.at(var);  // at() throws std::out_of_range when var is not in the table
+}
+
+}
diff --git a/imperative/src/include/megbrain/imperative/profiler_plugin.h b/imperative/src/include/megbrain/imperative/profiler_plugin.h
new file mode 100644
index 00000000..5ee28fe0
--- /dev/null
+++ b/imperative/src/include/megbrain/imperative/profiler_plugin.h
@@ -0,0 +1,46 @@
+/**
+ * \file imperative/src/include/megbrain/imperative/profiler_plugin.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "megbrain/plugin/base.h"
+
+#include "megbrain/imperative/profiler.h"
+
+namespace mgb::imperative::interpreter::intl {
+
+class ProfilerPlugin: public PluginBase {  // graph plugin that turns computing-graph events into Profiler records
+public:
+    struct OprInfo {  // per-operator record kept in m_opr_dict
+        uint64_t id;  // assigned from Profiler::next_id() in register_opr
+        CompNode comp_node;  // the single comp node of the opr (register_opr asserts there is exactly one)
+        std::shared_ptr<std::unordered_map<std::string, std::string>> params;  // string form of the opr's to_json() entries
+    };
+
+    struct VarInfo {  // per-var record kept in m_var_dict
+        uint64_t id;  // assigned from Profiler::next_id() in register_var
+        bool is_const;  // true when init_seq first meets the var as an input rather than an output
+        size_t ref_cnt;  // input uses counted by init_seq after the var was registered
+        std::atomic_size_t rt_ref_cnt;  // per-run countdown, reset from ref_cnt when a sequence starts
+    };
+private:
+    std::unordered_map<cg::OperatorNodeBase*, OprInfo> m_opr_dict;  // NOTE(review): no <unordered_map>/<memory>/<atomic>/<string> include here; relies on transitive includes
+    std::unordered_map<cg::VarNode*, std::unique_ptr<VarInfo>> m_var_dict;  // both tables are cleared on CompSeqOrderDetermined
+public:
+    explicit ProfilerPlugin(cg::ComputingGraph* graph);  // registers the event receivers on graph
+    void init_seq(cg::AsyncExecutable* comp_seq);  // fills both tables; asserts they are empty on entry
+    OprInfo& register_opr(cg::OperatorNodeBase* opr);  // inserts an OprInfo for opr and returns the stored entry
+    VarInfo& register_var(cg::VarNode* var);  // inserts a VarInfo for var and returns the stored entry
+    OprInfo& get_opr_info(cg::OperatorNodeBase* opr);  // uses at(): throws if opr is not registered
+    VarInfo& get_var_info(cg::VarNode* var);  // uses at(): throws if var is not registered
+};
+
+}