
profiler.cpp 8.0 kB

/**
 * \file imperative/src/impl/profiler.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/imperative/profiler.h"

#include "./function_hook.h"
#include "megbrain/imperative/ops/opr_attr.h"
#include "megbrain/imperative/physical_tensor.h"
#include "megbrain/plugin/opr_footprint.h"

#include "./event_pool.h"
#include "./op_trait.h"

namespace mgb {
namespace imperative {

namespace {
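
// Collect every comp node touched by applying `def` to `inputs`: the comp
// nodes of all inputs plus those of the inferred output attributes.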
CompNode::UnorderedSet collect_comp_nodes(
        const OpDef& def, const SmallVector<TensorPtr>& inputs) {
    CompNode::UnorderedSet comp_nodes;
    for (auto&& input : inputs) {
        comp_nodes.insert(input->comp_node());
    }
    for (auto&& output_attr : def.infer_output_attrs(def, inputs)) {
        comp_nodes.insert(output_attr.comp_node);
    }
    return comp_nodes;
}

DeviceTimer::SharedEvent alloc_recorded_event(CompNode device) {
    auto event = EventPool::with_timer().alloc_shared(device);
    event->record();
    return event;
}

OprFootprint footprint{};

}  // namespace
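
// DeviceTimer keeps, per comp node, a base pair of (recorded event, host time).
// A later device timestamp is reported on the host timeline as the elapsed
// device time since that base event plus the host time recorded with it.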
void DeviceTimer::reset(thin_function<double()> host_timer) {
    CompNode::foreach ([this, host_timer](CompNode device) {
        m_base_event_table[device] = {alloc_recorded_event(device), host_timer()};
    });
    m_host_timer = host_timer;
}
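
// Record an event now, but return a deferred getter: the actual timestamp is
// only resolved (after waiting for the event on the host) when the returned
// closure is invoked.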
thin_function<double()> DeviceTimer::get_device_time(CompNode device) {
    auto event = EventPool::with_timer().alloc_shared(device);
    event->record();
    if (m_base_event_table.count(device) == 0) {
        m_base_event_table[device] = {alloc_recorded_event(device), m_host_timer()};
    }
    auto base = m_base_event_table[device];
    return [base, event] {
        auto [base_event, host_time] = base;
        // TODO: sync once for each compnode
        event->host_wait();
        return base_event->elapsed_time_until(*event) * 1000 + host_time;
    };
}

void DeviceTimer::clear() {
    m_base_event_table.clear();
}
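
// Assign a stable id to each recorded tensor. The map is keyed by the raw
// pointer, so a weak_ptr is kept to detect when the address has been reused
// by a different tensor, in which case a fresh id is issued.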
size_t TensorRecorder::record_tensor(const TensorPtr& tensor) {
    if (m_tensor_map.count(tensor.get()) > 0) {
        auto& [prev, id] = m_tensor_map[tensor.get()];
        if (prev.lock() != tensor) {
            prev = tensor;
            id = m_next_id++;
        }
        return id;
    } else {
        auto id = m_next_id++;
        m_tensor_map.insert({tensor.get(), {std::weak_ptr{tensor}, id}});
        return id;
    }
}

void TensorRecorder::clear() {
    m_next_id = 0;
    m_tensor_map.clear();
}
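
// Materialize all pending device timestamps so the returned profile contains
// plain values instead of closures that would query the devices again.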
Profile& Profiler::get_profile() {
    for (auto& entry : m_profile) {
        for (auto& [device, device_begin, device_end] : entry.device_list) {
            MGB_MARK_USED_VAR(device);
            device_begin = [value = device_begin()] { return value; };
            device_end = [value = device_end()] { return value; };
        }
    }
    return m_profile;
}
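
// Install hooks on every op trait: apply_on_physical_tensor is wrapped to
// record inputs, outputs and host/device timings for each op execution, and
// (when PROFILE_FOOTPRINT is set) apply_on_var_node is wrapped to attach
// memory/computation footprints to the entry currently on the stack.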
void Profiler::start(uint32_t flags) {
    m_host_timer.reset();
    m_device_timer.reset([&] { return m_host_timer.get_msecs(); });
    OpTrait::for_each_trait([this, flags](OpTrait& trait) {
        auto hook_apply_on_physical_tensor =
                make_shared_hook(&trait.apply_on_physical_tensor);
        auto hook_apply_on_var_node =
                make_shared_hook(&trait.apply_on_var_node);
        hook_apply_on_physical_tensor->apply_hook([this, flags](
                auto&& apply, const OpDef& def, const SmallVector<TensorPtr>& inputs) {
            auto shape2vector = [](const TensorShape& shape) {
                std::vector<size_t> vector_shape;
                for (size_t i = 0; i < shape.ndim; i++) {
                    vector_shape.push_back(shape[i]);
                }
                return vector_shape;
            };
            ProfileEntry entry;
            entry.id = m_entry_count++;
            // TODO: assign parent
            entry.parent = 0;
            // Record apply context and save to m_profile
            entry.op = def.copy();
            for (auto&& input : inputs) {
                entry.inputs.push_back({m_tensor_recorder.record_tensor(input),
                                        shape2vector(input->layout()),
                                        input->comp_node()});
            }
            double host_begin = m_host_timer.get_msecs();
            auto&& comp_nodes = collect_comp_nodes(def, inputs);
            for (auto&& comp_node : comp_nodes) {
                entry.device_list.push_back(
                        {comp_node,
                         m_device_timer.get_device_time(comp_node),
                         {}});
            }
            if (flags & PROFILE_FOOTPRINT) {
                MGB_LOCK_GUARD(m_lock);
                m_entry_stack.push({&def, &entry, std::this_thread::get_id()});
            }
            // Do real apply
            auto outputs = apply(def, inputs);
            for (auto& [cn, dev_begin, dev_end] : entry.device_list) {
                MGB_MARK_USED_VAR(cn);
                MGB_MARK_USED_VAR(dev_begin);
                dev_end = m_device_timer.get_device_time(cn);
            }
            entry.host = {host_begin, m_host_timer.get_msecs()};
            for (auto&& output : outputs) {
                entry.outputs.push_back(
                        {m_tensor_recorder.record_tensor(output),
                         shape2vector(output->layout()), output->comp_node()});
            }
            if (flags & PROFILE_FOOTPRINT) {
                mgb_assert(std::get<1>(m_entry_stack.top()) == &entry);
                MGB_LOCK_GUARD(m_lock);
                m_entry_stack.pop();
            }
            m_profile.push_back(std::move(entry));
            return outputs;
        });
        if (flags & PROFILE_FOOTPRINT) {
            hook_apply_on_var_node->apply_hook(
                    [this](auto&& apply, const OpDef& def,
                           VarNodeArray inputs) -> cg::OperatorNodeBase* {
                        auto* operator_node = apply(def, std::move(inputs));
                        std::remove_reference_t<decltype(m_entry_stack.top())> top;
                        {
                            MGB_LOCK_GUARD(m_lock);
                            if (m_entry_stack.empty()) {
                                return operator_node;
                            }
                            top = m_entry_stack.top();
                        }
                        auto [current_op, current_entry, thread_id] = top;
                        if (current_op != &def ||
                            thread_id != std::this_thread::get_id()) {
                            return operator_node;
                        }
                        auto&& footprint_result =
                                footprint.calc_footprint(operator_node);
                        current_entry->memory = footprint_result.memory;
                        current_entry->computation =
                                footprint_result.computation;
#if MGB_ENABLE_JSON
                        current_entry->param = footprint_result.param;
#endif
                        return operator_node;
                    });
        }
        m_hooker_list.push_back(std::move(hook_apply_on_physical_tensor));
        m_hooker_list.push_back(std::move(hook_apply_on_var_node));
    });
}
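
// Drop the installed hooks and wait on every entry's device events so that
// all recorded device timings become available.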
void Profiler::stop() {
    m_hooker_list.clear();
    for (auto& entry : m_profile) {
        entry.wait_device();
    }
}

void Profiler::clear() {
    mgb_assert(m_entry_stack.empty(),
               "entry_stack should be empty after profile");
    mgb_assert(m_hooker_list.empty(), "hooks should be released");
    m_profile.clear();
    m_entry_count = 0;
    m_device_timer.clear();
    m_tensor_recorder.clear();
}

}  // namespace imperative
}  // namespace mgb
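
For orientation, below is a minimal usage sketch of driving this profiler from C++. It relies only on what this file itself demonstrates (start()/stop()/clear()/get_profile(), the PROFILE_FOOTPRINT flag, and a range-iterable Profile of ProfileEntry); the authoritative declarations live in megbrain/imperative/profiler.h, and in particular the Profiler::PROFILE_FOOTPRINT qualification below is an inferred assumption, not confirmed by this file.

// Usage sketch, not part of profiler.cpp. Assumption: PROFILE_FOOTPRINT is
// reachable as Profiler::PROFILE_FOOTPRINT (it is used unqualified inside the
// member functions above, so this qualification is inferred).
#include "megbrain/imperative/profiler.h"

using namespace mgb::imperative;

void profile_session(Profiler& profiler) {
    // Install the hooks; every op applied afterwards is recorded.
    profiler.start(Profiler::PROFILE_FOOTPRINT);

    // ... run imperative ops here; each apply_on_physical_tensor call goes
    // through the hook installed in Profiler::start() ...

    // Drop the hooks and wait for all recorded device events.
    profiler.stop();

    // Device timings are now plain values; inspect the recorded entries.
    for (auto& entry : profiler.get_profile()) {
        (void)entry;  // e.g. entry.op, entry.host, entry.device_list,
                      //      entry.memory, entry.computation
    }

    // Reset internal state before starting another session.
    profiler.clear();
}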
