Browse Source

fix(profiler): attribute op executions to their trigger (DTR recompute vs. normal dispatch) so DTR-mode profiles are recorded correctly — thread an execution `reason` through the apply stack and OpExecute events, record one execution entry per (re)run of an op instead of a single begin/end pair, and accumulate a `dtr_overhead_us` counter for DTR-triggered kernels

GitOrigin-RevId: 79b5a6b526
release-1.6
Megvii Engine Team 3 years ago
parent
commit
defbc20efb
5 changed files with 46 additions and 24 deletions
  1. +11
    -9
      imperative/src/impl/interpreter/interpreter_impl.cpp
  2. +3
    -2
      imperative/src/impl/interpreter/interpreter_impl.h
  3. +2
    -4
      imperative/src/impl/profiler/chrome_timeline.cpp
  4. +1
    -0
      imperative/src/impl/profiler/events.h
  5. +29
    -9
      imperative/src/impl/profiler/states.h

+ 11
- 9
imperative/src/impl/interpreter/interpreter_impl.cpp View File

@@ -616,7 +616,7 @@ void ChannelImpl::release_tensor(TensorInfo* dest) {
void ChannelImpl::regenerate(TensorInfo* dest) { void ChannelImpl::regenerate(TensorInfo* dest) {
if (dest->evict_type == EvictType::DROP) { if (dest->evict_type == EvictType::DROP) {
auto &&path = dest->producer; auto &&path = dest->producer;
m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest});
m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest, "dtr"});
if (!m_applying) flush_apply_stack(); if (!m_applying) flush_apply_stack();
} else if (dest->evict_type == EvictType::SWAP) { } else if (dest->evict_type == EvictType::SWAP) {
MGB_RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen); MGB_RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen);
@@ -625,7 +625,7 @@ void ChannelImpl::regenerate(TensorInfo* dest) {
} }
} }


void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
using namespace ranges; using namespace ranges;
using namespace ranges::views; using namespace ranges::views;
auto& state = get_worker_state(); auto& state = get_worker_state();
@@ -689,7 +689,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
} }
return outputs; return outputs;
}; };
MGB_RECORD_EVENT(OpExecuteEvent, apply_id);
MGB_RECORD_EVENT(OpExecuteEvent, apply_id, {}, reason);
// Begin profiling operator // Begin profiling operator
SmallVector<std::pair<CompNode, uint64_t>> kernels; SmallVector<std::pair<CompNode, uint64_t>> kernels;
if (profiling_device) { if (profiling_device) {
@@ -769,7 +769,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
} }
m_dtr.unpin(cmd.inputs); m_dtr.unpin(cmd.inputs);
} }
MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id);
MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id, {}, reason);
// End profiling operator // End profiling operator
} }


@@ -777,7 +777,7 @@ void ChannelImpl::flush_apply_stack() {
m_applying = true; m_applying = true;
auto& state = get_worker_state(); auto& state = get_worker_state();
while (!m_apply_stack.empty()) { while (!m_apply_stack.empty()) {
auto& [cmd, idx, recomp] = m_apply_stack.top(); // cmd.inputs[0~idx-1] is in memory
auto& [cmd, idx, recomp, reason] = m_apply_stack.top(); // cmd.inputs[0~idx-1] is in memory
if (idx == 0) { if (idx == 0) {
if (state.options.enable_dtr_auto_drop) { if (state.options.enable_dtr_auto_drop) {
m_dtr.pin(cmd.inputs); m_dtr.pin(cmd.inputs);
@@ -801,10 +801,9 @@ void ChannelImpl::flush_apply_stack() {
} }
if (regen) continue; if (regen) continue;
// the required input tensors are already in memory // the required input tensors are already in memory
auto cmd_backup = cmd;
auto recomp_backup = recomp;
auto [cmd_backup, recomp_backup, reason_backup] = std::make_tuple(cmd, recomp, reason);
m_apply_stack.pop(); m_apply_stack.pop();
do_apply_op(cmd_backup);
do_apply_op(cmd_backup, reason_backup);
if (recomp_backup) { if (recomp_backup) {
MGB_RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen); MGB_RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen);
for (auto o : cmd_backup.outputs) { for (auto o : cmd_backup.outputs) {
@@ -829,6 +828,7 @@ bool ChannelImpl::auto_evict(size_t force_num) {
sample_on_device(m_dtr.comp_node, false); sample_on_device(m_dtr.comp_node, false);
auto best = m_dtr.find_best_tensor(state.options.enable_dtr_sqrt_sampling && !force_num); auto best = m_dtr.find_best_tensor(state.options.enable_dtr_sqrt_sampling && !force_num);
if (!best) { if (!best) {
MGB_RECORD_EVENT(AutoEvictFinishEvent);
break; break;
} }
if (best->ptr.unique() && best->ptr->blob().unique()) { if (best->ptr.unique() && best->ptr->blob().unique()) {
@@ -947,7 +947,9 @@ void ChannelImpl::alloc_tensor_with_evict(Blob* x) {
set_log_level(pre_level); set_log_level(pre_level);
mgb_log_warn("reallocating all cuda memory to alleviate fragmentation, the performance may be affected"); mgb_log_warn("reallocating all cuda memory to alleviate fragmentation, the performance may be affected");
set_log_level(LogLevel::NO_LOG); set_log_level(LogLevel::NO_LOG);
imperative_log_profile_begin("defrag");
BlobManager::inst()->defrag(x->comp_node()); BlobManager::inst()->defrag(x->comp_node());
imperative_log_profile_end("defrag");
BlobManager::inst()->alloc_direct(x, x->size()); BlobManager::inst()->alloc_direct(x, x->size());
} }
}); });
@@ -1025,7 +1027,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
return; return;
} }
} }
m_apply_stack.push({cmd, 0, nullptr});
m_apply_stack.push({cmd, 0, nullptr, "cmd"});
flush_apply_stack(); flush_apply_stack();
for (size_t i = 0; i < cmd.outputs.size(); ++i) { for (size_t i = 0; i < cmd.outputs.size(); ++i) {
auto output = cmd.outputs[i]; auto output = cmd.outputs[i];


+ 3
- 2
imperative/src/impl/interpreter/interpreter_impl.h View File

@@ -104,8 +104,8 @@ private:
void release_tensor(TensorInfo* dest); void release_tensor(TensorInfo* dest);


void regenerate(TensorInfo* dest); void regenerate(TensorInfo* dest);
void do_apply_op(const ApplyOp& cmd);
void flush_apply_stack(); void flush_apply_stack();
void do_apply_op(const ApplyOp& cmd, std::string reason);
std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPtr>> init_output_and_workspace( std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPtr>> init_output_and_workspace(
const OpDef& def, const OpDef& def,
@@ -150,7 +150,8 @@ private:
std::exception_ptr m_worker_exc; std::exception_ptr m_worker_exc;
std::function<void(std::string, std::string)> m_profile_dump_callback; std::function<void(std::string, std::string)> m_profile_dump_callback;
size_t m_storage_id = 0; size_t m_storage_id = 0;
std::stack<std::tuple<ApplyOp, size_t, TensorInfo*>> m_apply_stack;
// TODO: use explicit struct
std::stack<std::tuple<ApplyOp, size_t, TensorInfo*, std::string>> m_apply_stack;
bool m_applying = false; bool m_applying = false;
bool m_closed = false; bool m_closed = false;




+ 2
- 4
imperative/src/impl/profiler/chrome_timeline.cpp View File

@@ -200,13 +200,13 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {


ChromeTraceEvent& new_host_event(std::string name, char ph) { ChromeTraceEvent& new_host_event(std::string name, char ph) {
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time)); return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time));
};
}


ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) {
using namespace std::literals::chrono_literals; using namespace std::literals::chrono_literals;
auto time = since_start(to_device_time(current->time, device)); auto time = since_start(to_device_time(current->time, device));
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time); return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time);
};
}


const char* to_cstr(TensorCommandKind kind) { const char* to_cstr(TensorCommandKind kind) {
switch(kind) { switch(kind) {
@@ -241,14 +241,12 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
new_host_event("OpDispatch", 'E').args(current_op->detail()); new_host_event("OpDispatch", 'E').args(current_op->detail());
} else if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) { } else if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) {
mgb_assert(event.op_id != 0); mgb_assert(event.op_id != 0);
current_op->execute_begin = current->time;
new_host_event(current_op->name, 'B'); new_host_event(current_op->name, 'B');
new_host_event(pid_str, 't') new_host_event(pid_str, 't')
.cat("OpDispatch") .cat("OpDispatch")
.id(current_op->id) .id(current_op->id)
.scope(pid_str); .scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, OpExecuteFinishEvent>) { } else if constexpr (std::is_same_v<TEvent, OpExecuteFinishEvent>) {
current_op->execute_end = current->time;
new_host_event(current_op->name, 'E') new_host_event(current_op->name, 'E')
.args(current_op->detail()); .args(current_op->detail());
} else if constexpr (std::is_same_v<TEvent, KernelLaunchEvent>) { } else if constexpr (std::is_same_v<TEvent, KernelLaunchEvent>) {


+ 1
- 0
imperative/src/impl/profiler/events.h View File

@@ -84,6 +84,7 @@ DEF_DUR_EVENT(OpOutput, {
DEF_DUR_EVENT(OpExecute, { DEF_DUR_EVENT(OpExecute, {
uint64_t op_id; uint64_t op_id;
SmallVector<CompNode> device_list; SmallVector<CompNode> device_list;
std::string reason;
}); });


DEF_DUR_EVENT(KernelLaunch, { DEF_DUR_EVENT(KernelLaunch, {


+ 29
- 9
imperative/src/impl/profiler/states.h View File

@@ -62,8 +62,13 @@ struct ProfileOperatorState {
CompNode device; CompNode device;
Trace trace; Trace trace;


profiler::HostTime execute_begin;
profiler::HostTime execute_end;
struct Execution {
std::string reason;
profiler::HostTime begin;
profiler::HostTime end;
};

SmallVector<Execution> executions;


nlohmann::json detail() { nlohmann::json detail() {
nlohmann::json args; nlohmann::json args;
@@ -285,7 +290,7 @@ public:
op.outputs = event.outputs; op.outputs = event.outputs;
op.trace = event.trace; op.trace = event.trace;
for (auto&& output: event.outputs) { for (auto&& output: event.outputs) {
m_tensors.at(output).source = op.id;
m_tensors[output].source = op.id;
} }
} else if constexpr (std::is_same_v<T, TensorDeclareEvent>) { } else if constexpr (std::is_same_v<T, TensorDeclareEvent>) {
auto& tensor = m_tensors[event.tensor_id]; auto& tensor = m_tensors[event.tensor_id];
@@ -293,7 +298,7 @@ public:
tensor.id = event.tensor_id; tensor.id = event.tensor_id;
tensor.name = event.name; tensor.name = event.name;
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) { } else if constexpr (std::is_same_v<T, TensorProduceEvent>) {
auto& tensor = m_tensors.at(event.tensor_id);
auto& tensor = m_tensors[event.tensor_id];
if (!m_device_tid_table.count(event.device)) { if (!m_device_tid_table.count(event.device)) {
m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()}; m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()};
} }
@@ -308,15 +313,24 @@ public:
using T = std::decay_t<decltype(event)>; using T = std::decay_t<decltype(event)>;
// update current_op/tensor // update current_op/tensor
if constexpr (is_op_event<T>::value) { if constexpr (is_op_event<T>::value) {
current_op = &m_operators.at(event.op_id);
current_op = &m_operators[event.op_id];
if (current_op->id == 0) {
current_op->id = event.op_id;
current_op->name = "UnknownOperator";
}
} else if constexpr (is_tensor_event<T>::value) { } else if constexpr (is_tensor_event<T>::value) {
mgb_assert(m_tensors.count(event.tensor_id) != 0, "tensor not found");
current_tensor = &m_tensors.at(event.tensor_id);
current_tensor = &m_tensors[event.tensor_id];
if (current_tensor->id == 0) {
current_tensor->id = event.tensor_id;
current_tensor->name = "UnknownTensor";
}
} }
if constexpr (std::is_same_v<T, OpExecuteEvent>) { if constexpr (std::is_same_v<T, OpExecuteEvent>) {
current_op->execute_begin = current->time;
current_op->executions.emplace_back();
current_op->executions.back().reason = event.reason;
current_op->executions.back().begin = current->time;
} else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) { } else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) {
current_op->execute_end = current->time;
current_op->executions.back().end = current->time;
} }
// update counters // update counters
if constexpr (std::is_same_v<T, OpDispatchEvent>) { if constexpr (std::is_same_v<T, OpDispatchEvent>) {
@@ -337,6 +351,12 @@ public:
} }
} else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) { } else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) {
inc_counter("nr_exception", 1); inc_counter("nr_exception", 1);
} else if constexpr (std::is_same_v<T, KernelLaunchFinishEvent>) {
auto& execution = current_op->executions.back();
if (execution.reason == "dtr") {
auto overhead = to_device_time(current->time, event.device) - to_device_time(execution.begin, event.device);
inc_counter("dtr_overhead_us", std::chrono::duration_cast<std::chrono::microseconds>(overhead).count());
}
} }
// visit_event_impl // visit_event_impl
self.visit_event(event); self.visit_event(event);


Loading…
Cancel
Save