From 59d59766220987bf65d821d750907ae5ab57b2f8 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 1 Jul 2021 12:50:06 +0800 Subject: [PATCH] fix(profiler): collect records when thread exit GitOrigin-RevId: fb796dd61cf66de028e52a4e9dc8ba3484640449 --- imperative/python/megengine/tools/profiler.py | 11 +- .../src/impl/interpreter/interpreter_impl.cpp | 150 ++++++++++----------- imperative/src/impl/physical_tensor.cpp | 5 + imperative/src/impl/profiler.cpp | 10 +- imperative/src/impl/profiler/chrome_timeline.cpp | 34 +++-- imperative/src/impl/profiler/states.h | 4 +- imperative/src/impl/profiler_plugin.cpp | 38 +++--- .../src/include/megbrain/imperative/profiler.h | 23 ++-- 8 files changed, 148 insertions(+), 127 deletions(-) diff --git a/imperative/python/megengine/tools/profiler.py b/imperative/python/megengine/tools/profiler.py index bea9615f..a50c25ef 100755 --- a/imperative/python/megengine/tools/profiler.py +++ b/imperative/python/megengine/tools/profiler.py @@ -53,10 +53,6 @@ def main(): if args.format: prof_args["formats"] = args.format - if args.clean: - for file in os.listdir(profiler.directory): - os.remove(os.path.join(profiler.directory, file)) - if len(extras) == 0: if not args.merge_trace_events: parser.print_usage() @@ -74,9 +70,14 @@ def main(): sys.argv[:] = [filename, *extras[1:]] profiler = Profiler(**prof_args) + + if args.clean: + for file in os.listdir(profiler.directory): + os.remove(os.path.join(profiler.directory, file)) + with profiler: if args.module: - runpy.run_module(filename) + run_module(filename) else: run_script(filename) profiler.dump() diff --git a/imperative/src/impl/interpreter/interpreter_impl.cpp b/imperative/src/impl/interpreter/interpreter_impl.cpp index f1d0f21f..55b62b48 100644 --- a/imperative/src/impl/interpreter/interpreter_impl.cpp +++ b/imperative/src/impl/interpreter/interpreter_impl.cpp @@ -30,12 +30,6 @@ using namespace imperative; using namespace interpreter; using namespace interpreter::intl; -#define RECORD_EVENT(type, ...) \ - if (Profiler::is_profiling()) { \ - Profiler::record(type{__VA_ARGS__}); \ - } \ - - namespace { auto tinfo_to_tid(SmallVector tinfo) { SmallVector tid; @@ -70,12 +64,12 @@ namespace mgb { **/ SYMBOL_EXPORT void imperative_log_profile_begin(const char* message) { - RECORD_EVENT(CustomEvent, std::string{message}); + MGB_RECORD_EVENT(CustomEvent, std::string{message}); } SYMBOL_EXPORT void imperative_log_profile_end(const char* message) { - RECORD_EVENT(CustomFinishEvent, std::string{message}); + MGB_RECORD_EVENT(CustomFinishEvent, std::string{message}); } SYMBOL_EXPORT @@ -159,13 +153,13 @@ TensorInfo* ChannelImpl::put_impl(const DeviceTensorND& data, const HostTensorND auto& state = get_channel_state(); auto _ = StackManager::Guard{"Put", &state.stack_manager}; auto info = alloc(); - RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandKind::Put); + MGB_RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandKind::Put); init(info, {data.layout(), data.comp_node()}); info->mem_desc.id = StorageIdentifier::make(++m_storage_id); info->ptr = Tensor::make(data, hvalue); - RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr()); + MGB_RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr()); info->status = TensorInfo::Produced; - RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandKind::Put); + MGB_RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandKind::Put); return info; } @@ -231,7 +225,7 @@ void ChannelImpl::dispatch_default_cpu( auto _ = StackManager::Guard(name, &state.stack_manager); auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); - RECORD_EVENT(ShapeInferEvent, validated); + MGB_RECORD_EVENT(ShapeInferEvent, validated); SmallVector input_tensornds; input_tensornds.reserve(input_descs.size()); @@ -289,7 +283,7 @@ void ChannelImpl::dispatch_default_cpu( } return op_info; }; - RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, + MGB_RECORD_EVENT(OpDispatchEvent, op_id, name, op_info_getter, tinfo_to_tid(input_infos), tinfo_to_tid(output_infos), state.stack_manager.dump()); } @@ -306,7 +300,7 @@ void ChannelImpl::dispatch_kernel( auto _ = StackManager::Guard{name, &state.stack_manager}; auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); - RECORD_EVENT(ShapeInferEvent, validated); + MGB_RECORD_EVENT(ShapeInferEvent, validated); ApplyOp cmd{Profiler::next_id(), std::move(op)}; cmd.inputs = std::move(input_infos); @@ -332,7 +326,7 @@ void ChannelImpl::dispatch_kernel( } return op_info; }; - RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, + MGB_RECORD_EVENT(OpDispatchEvent, cmd.id, name, op_info_getter, tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs), state.stack_manager.dump()); m_buffer.enqueue(std::move(cmd)); @@ -426,7 +420,7 @@ DType ChannelImpl::get_dtype(Handle handle) { mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(), "invalid handle: %p", handle); auto info = reinterpret_cast(handle); - RECORD_EVENT(TensorGetPropEvent, info->id, TensorProp::DType); + MGB_RECORD_EVENT(TensorGetPropEvent, info->id, TensorProp::DType); auto ret = info->desc.layout.dtype; mgb_assert(ret.valid()); return ret; @@ -438,7 +432,7 @@ CompNode ChannelImpl::get_device(Handle handle) { mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(), "invalid handle: %p", handle); auto info = reinterpret_cast(handle); - RECORD_EVENT(TensorGetPropEvent, info->id, TensorProp::Device); + MGB_RECORD_EVENT(TensorGetPropEvent, info->id, TensorProp::Device); auto ret = info->desc.comp_node; mgb_assert(ret.valid()); return ret; @@ -512,7 +506,7 @@ TensorInfo* ChannelImpl::alloc() { void ChannelImpl::init(TensorInfo* info, LogicalTensorDesc desc) { m_valid_handle.insert(info); - RECORD_EVENT(TensorDeclareEvent, info->id, info->name); + MGB_RECORD_EVENT(TensorDeclareEvent, info->id, info->name); info->status = TensorInfo::Allocated; info->desc = std::move(desc); info->mem_desc.layout = info->desc.layout; @@ -554,7 +548,7 @@ void ChannelImpl::free(TensorInfo* ptr) { } void ChannelImpl::recursive_free(TensorInfo* ptr) { - RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandKind::RecFree); + MGB_RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandKind::RecFree); SmallVector inps; if (ptr->producer) { for (auto i : ptr->producer->inputs) { @@ -569,7 +563,7 @@ void ChannelImpl::recursive_free(TensorInfo* ptr) { recursive_free(i); } } - RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandKind::RecFree); + MGB_RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandKind::RecFree); } void ChannelImpl::real_free(TensorInfo* ptr) { @@ -581,9 +575,9 @@ void ChannelImpl::real_free(TensorInfo* ptr) { ptr->detach_producer(); bool has_value = ptr->ptr != nullptr; if (has_value) { - RECORD_EVENT(TensorReleaseEvent, ptr->id); + MGB_RECORD_EVENT(TensorReleaseEvent, ptr->id); } - RECORD_EVENT(TensorEraseEvent, ptr->id, ptr->ptr_use_count); + MGB_RECORD_EVENT(TensorEraseEvent, ptr->id, ptr->ptr_use_count); ptr->status = TensorInfo::Deleted; MGB_LOCK_GUARD(m_mutex); m_pool.free(ptr); @@ -599,7 +593,7 @@ void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) { auto& state = get_worker_state(); MGB_LOCK_GUARD(m_mutex); m_dtr.update_used_time(dest); - RECORD_EVENT(TensorProduceEvent, dest->id, ptr->layout(), ptr->comp_node(), ptr->dev_tensor().raw_ptr()); + MGB_RECORD_EVENT(TensorProduceEvent, dest->id, ptr->layout(), ptr->comp_node(), ptr->dev_tensor().raw_ptr()); // update tensor desc for static infer dest->desc.layout = ptr->layout(); dest->desc.comp_node = ptr->comp_node(); @@ -614,7 +608,7 @@ void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) { } void ChannelImpl::release_tensor(TensorInfo* dest) { - RECORD_EVENT(TensorReleaseEvent, dest->id); + MGB_RECORD_EVENT(TensorReleaseEvent, dest->id); MGB_LOCK_GUARD(m_mutex); dest->ptr.reset(); } @@ -625,9 +619,9 @@ void ChannelImpl::regenerate(TensorInfo* dest) { m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest}); if (!m_applying) flush_apply_stack(); } else if (dest->evict_type == EvictType::SWAP) { - RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen); + MGB_RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen); produce_tensor(dest, Tensor::make(dest->h_value)); - RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandKind::ReGen); + MGB_RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandKind::ReGen); } } @@ -695,7 +689,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) { } return outputs; }; - RECORD_EVENT(OpExecuteEvent, apply_id); + MGB_RECORD_EVENT(OpExecuteEvent, apply_id); // Begin profiling operator SmallVector> kernels; if (profiling_device) { @@ -710,9 +704,9 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) { } for (auto* input: cmd.inputs) { auto input_id = input->id; - RECORD_EVENT(OpInputEvent, input_id); - RECORD_EVENT(TensorUsageEvent, input_id); - RECORD_EVENT(OpInputFinishEvent, input_id); + MGB_RECORD_EVENT(OpInputEvent, input_id); + MGB_RECORD_EVENT(TensorUsageEvent, input_id); + MGB_RECORD_EVENT(OpInputFinishEvent, input_id); } // Fused by command buffer. @see: CommandBuffer::fuse_del // Now if dest is inplacable, it's refcnt would be decreased to 1 and owned by tensor_inputs after Del. @@ -721,40 +715,40 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) { // refcnt --, owners: [tensor_inputs] // if it's decreased to 1, would be detected at @see: proxy_graph_detail::apply_on_physical_tensor uint64_t del_id = del->id; - RECORD_EVENT(TensorCommandEvent, del_id, TensorCommandKind::Del); + MGB_RECORD_EVENT(TensorCommandEvent, del_id, TensorCommandKind::Del); free(del); - RECORD_EVENT(TensorCommandFinishEvent, del_id, TensorCommandKind::Del); + MGB_RECORD_EVENT(TensorCommandFinishEvent, del_id, TensorCommandKind::Del); } // Before wait //TODO: split operator wait and execute so that OpWait could be corrected recorded. // Before execute for (auto&& [device, kernel_id]: kernels) { - RECORD_EVENT(KernelLaunchEvent, apply_id, kernel_id, device); - RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); + MGB_RECORD_EVENT(KernelLaunchEvent, apply_id, kernel_id, device); + MGB_RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); } // Apply op // Here std::move is REQUIRED for removing duplicated references. auto outputs = apply_on_physical_tensor(apply_on_physical_tensor, *cmd.op, inputs); // After execute for (auto&& [device, kernel_id]: kernels) { - RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); - RECORD_EVENT(KernelLaunchFinishEvent, apply_id, kernel_id, device); + MGB_RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); + MGB_RECORD_EVENT(KernelLaunchFinishEvent, apply_id, kernel_id, device); } // End profiling operator mgb_assert(outputs.size() == cmd.outputs.size()); for (size_t i = 0; i < outputs.size(); ++i) { auto output = cmd.outputs[i]; if (output == nullptr) { - RECORD_EVENT(OpOutputEvent, 0); - RECORD_EVENT(OpOutputFinishEvent, 0); + MGB_RECORD_EVENT(OpOutputEvent, 0); + MGB_RECORD_EVENT(OpOutputFinishEvent, 0); } else if (output->ptr != nullptr) { - RECORD_EVENT(OpOutputEvent, output->id); - RECORD_EVENT(OpOutputFinishEvent, output->id); + MGB_RECORD_EVENT(OpOutputEvent, output->id); + MGB_RECORD_EVENT(OpOutputFinishEvent, output->id); } else { - RECORD_EVENT(OpOutputEvent, output->id); + MGB_RECORD_EVENT(OpOutputEvent, output->id); produce_tensor(output, outputs[i].tensor); output->mem_desc = outputs[i].desc; - RECORD_EVENT(OpOutputFinishEvent, output->id); + MGB_RECORD_EVENT(OpOutputFinishEvent, output->id); sample_on_device(output->desc.comp_node, false); } } @@ -775,7 +769,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) { } m_dtr.unpin(cmd.inputs); } - RECORD_EVENT(OpExecuteFinishEvent, apply_id); + MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id); // End profiling operator } @@ -789,7 +783,7 @@ void ChannelImpl::flush_apply_stack() { m_dtr.pin(cmd.inputs); } if (recomp) { - RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandKind::ReGen); + MGB_RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandKind::ReGen); } } bool regen = false; @@ -812,7 +806,7 @@ void ChannelImpl::flush_apply_stack() { m_apply_stack.pop(); do_apply_op(cmd_backup); if (recomp_backup) { - RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen); + MGB_RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen); for (auto o : cmd_backup.outputs) { if (o) { m_dtr.update_dsu_after_recompute(o); @@ -831,7 +825,7 @@ bool ChannelImpl::auto_evict(size_t force_num) { size_t current_memory = m_dtr.comp_node.get_used_memory(); size_t flag = false; while ((state.options.dtr_eviction_threshold > 0 && current_memory > state.options.dtr_eviction_threshold) || force_num > 0) { - RECORD_EVENT(AutoEvictEvent); + MGB_RECORD_EVENT(AutoEvictEvent); sample_on_device(m_dtr.comp_node, false); auto best = m_dtr.find_best_tensor(state.options.enable_dtr_sqrt_sampling && !force_num); if (!best) { @@ -849,7 +843,7 @@ bool ChannelImpl::auto_evict(size_t force_num) { m_dtr.update_dsu_after_evict(best); } sample_on_device(m_dtr.comp_node, false); - RECORD_EVENT(AutoEvictFinishEvent); + MGB_RECORD_EVENT(AutoEvictFinishEvent); } return flag; } @@ -888,7 +882,7 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { mgb_assert(!m_waitee, "duplicate waitee"); m_waitee = info; m_waitee_id = Profiler::next_id(); - RECORD_EVENT(TensorWaitPropEvent, info->id, m_waitee_id, prop); + MGB_RECORD_EVENT(TensorWaitPropEvent, info->id, m_waitee_id, prop); bool require_host = prop == TensorProp::HostValue; auto host_available = [&]{ return info->ptr && info->ptr->value_fetched(); @@ -904,14 +898,14 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { check_worker_exc_unsafe(); return require_host ? host_available() : static_cast(info->ptr); }); - RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop); + MGB_RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop); m_waitee = nullptr; return info->ptr; } void ChannelImpl::notify_tensor_unsafe(TensorInfo* info) { if (info == m_waitee) { - RECORD_EVENT(TensorNotifyPropEvent, info->id); + MGB_RECORD_EVENT(TensorNotifyPropEvent, info->id); m_cv.notify_all(); } } @@ -1014,12 +1008,12 @@ void ChannelImpl::process_one_task(Command& icmd) { auto cmd_visitor = [&](const auto& cmd) { using T = std::decay_t; if constexpr (std::is_same_v) { - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Put); - RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); + MGB_RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Put); + MGB_RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); auto value = cmd.no_cache ? std::make_shared(cmd.value) : Tensor::make(cmd.value); - RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); + MGB_RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); produce_tensor(cmd.dest, std::move(value)); - RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Put); + MGB_RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Put); sample_on_device(cmd.dest->desc.comp_node, false); } else if constexpr (std::is_same_v) { for (auto& i : cmd.inputs) { @@ -1088,11 +1082,11 @@ void ChannelImpl::process_one_task(Command& icmd) { } } } else if constexpr (std::is_same_v) { - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Del); + MGB_RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Del); CompNode device = cmd.dest->desc.comp_node; uint64_t tensor_id = cmd.dest->id; free(cmd.dest); - RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandKind::Del); + MGB_RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandKind::Del); sample_on_device(device, false); } else if constexpr (std::is_same_v) { if (cmd.dest->invalid) return; @@ -1106,64 +1100,64 @@ void ChannelImpl::process_one_task(Command& icmd) { imperative_log_profile_end("GetValue"); } else if constexpr (std::is_same_v) { if (cmd.dest->invalid) return; - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapIn); + MGB_RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapIn); produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value)); - RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapIn); + MGB_RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapIn); sample_on_device(cmd.dest->desc.comp_node, false); } else if constexpr (std::is_same_v) { if (cmd.dest->invalid) return; - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapOut); + MGB_RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapOut); cmd.dest->h_value = cmd.dest->ptr->get_value(); if (cmd.dest->evict_type == EvictType::NONE) { cmd.dest->evict_type = EvictType::SWAP; cmd.dest->status = TensorInfo::Swapped; release_tensor(cmd.dest); } - RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapOut); + MGB_RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapOut); sample_on_device(cmd.dest->desc.comp_node, false); } else if constexpr (std::is_same_v) { if (cmd.dest->invalid) return; - RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Drop); + MGB_RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Drop); do_drop(cmd.dest, true); - RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Drop); + MGB_RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Drop); } else if constexpr (std::is_same_v) { options.set_option(cmd.key, cmd.value); } else if constexpr (std::is_same_v) { - RECORD_EVENT(StartProfileEvent); + MGB_RECORD_EVENT(StartProfileEvent); CompNode::sync_all(); for (auto* info: cmd.capture_tensors) { - RECORD_EVENT(TensorDeclareEvent, info->id, info->name); + MGB_RECORD_EVENT(TensorDeclareEvent, info->id, info->name); if (info->status == TensorInfo::Produced) { // TODO: handle swap/drop - RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, info->ptr->dev_tensor().raw_ptr()); + MGB_RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, info->ptr->dev_tensor().raw_ptr()); } } CompNode::foreach([&](CompNode device){ if (Profiler::get_option("sample_rate", 0)) { sample_on_device(device, true); } - RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); + MGB_RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); }); - RECORD_EVENT(StartProfileFinishEvent); + MGB_RECORD_EVENT(StartProfileFinishEvent); } else if constexpr (std::is_same_v) { - RECORD_EVENT(StopProfileEvent); + MGB_RECORD_EVENT(StopProfileEvent); for (auto* info: cmd.escape_tensors) { bool has_value = info->status == TensorInfo::Produced; if (has_value) { - RECORD_EVENT(TensorReleaseEvent, info->id); + MGB_RECORD_EVENT(TensorReleaseEvent, info->id); } - RECORD_EVENT(TensorEraseEvent, info->id); + MGB_RECORD_EVENT(TensorEraseEvent, info->id); } CompNode::foreach([&](CompNode device){ if (Profiler::get_option("sample_rate", 0)) { sample_on_device(device, true); } }); - RECORD_EVENT(StopProfileFinishEvent); + MGB_RECORD_EVENT(StopProfileFinishEvent); } else if constexpr (std::is_same_v) { - RECORD_EVENT(ScopeEvent, cmd.scope_name); + MGB_RECORD_EVENT(ScopeEvent, cmd.scope_name); } else if constexpr (std::is_same_v) { - RECORD_EVENT(ScopeFinishEvent, cmd.scope_name); + MGB_RECORD_EVENT(ScopeFinishEvent, cmd.scope_name); } else { static_assert(!std::is_same_v); } @@ -1186,7 +1180,7 @@ void ChannelImpl::process_one_task(Command& icmd) { cmd.dest->invalid = true; } m_worker_exc = std::current_exception(); - RECORD_EVENT(WorkerExceptionEvent); + MGB_RECORD_EVENT(WorkerExceptionEvent); if (m_waitee) { notify_tensor_unsafe(m_waitee); } @@ -1347,7 +1341,7 @@ void ChannelImpl::push_scope(std::string name) { mgb_assert(check_available(), "Channel already closed"); auto& state = get_channel_state(); state.stack_manager.enter(name); - RECORD_EVENT(ScopeEvent, name); + MGB_RECORD_EVENT(ScopeEvent, name); m_buffer.enqueue(PushScope{name}); } @@ -1356,7 +1350,7 @@ void ChannelImpl::pop_scope(std::string name) { mgb_assert(check_available(), "Channel already closed"); auto& state = get_channel_state(); state.stack_manager.exit(name); - RECORD_EVENT(ScopeFinishEvent, name); + MGB_RECORD_EVENT(ScopeFinishEvent, name); m_buffer.enqueue(PopScope{name}); } @@ -1376,9 +1370,9 @@ void ChannelImpl::sample_on_device(CompNode device, bool force) { return; } } - RECORD_EVENT(SampleDeviceEvent, device); + MGB_RECORD_EVENT(SampleDeviceEvent, device); auto [total, free] = device.get_mem_status_bytes(); - RECORD_EVENT(SampleDeviceFinishEvent, device, total, free); + MGB_RECORD_EVENT(SampleDeviceFinishEvent, device, total, free); } void ChannelImpl::DynamicSublinear::pin(const SmallVector& vec) { diff --git a/imperative/src/impl/physical_tensor.cpp b/imperative/src/impl/physical_tensor.cpp index 5f675347..c7f11d4d 100644 --- a/imperative/src/impl/physical_tensor.cpp +++ b/imperative/src/impl/physical_tensor.cpp @@ -11,10 +11,13 @@ #include "megbrain/imperative.h" #include "megbrain/imperative/blob_manager.h" +#include "megbrain/imperative/profiler.h" #include "./event_pool.h" #include "./async_releaser.h" +#include "./profiler/events.h" + #include namespace mgb { @@ -128,9 +131,11 @@ Tensor::Tensor(BlobPtr blob, const TensorLayout& layout, size_t offset, const Ho Tensor::Tensor(const HostTensorND &hv) : Tensor(hv.layout(), hv.comp_node()) { m_value = hv; + MGB_RECORD_EVENT(profiler::HostToDeviceEvent, hv.layout(), hv.comp_node(), hv.raw_ptr(), dev_tensor().raw_ptr()); dev_tensor().copy_from_fixlayout(hv); // even though hv is saved in m_value, Tensor itself could be // released before copy completes + MGB_RECORD_EVENT(profiler::HostToDeviceFinishEvent, hv.layout(), hv.comp_node(), hv.raw_ptr(), dev_tensor().raw_ptr()); AsyncReleaser::inst()->add(hv); } diff --git a/imperative/src/impl/profiler.cpp b/imperative/src/impl/profiler.cpp index 8a7c26d2..f8c17136 100644 --- a/imperative/src/impl/profiler.cpp +++ b/imperative/src/impl/profiler.cpp @@ -37,11 +37,12 @@ std::shared_ptr Timer::record_device(CompNode device) { return event; } +std::vector Profiler::sm_records; Profiler::options_t Profiler::sm_profile_options; std::mutex Profiler::sm_mutex; std::unordered_map Profiler::sm_profilers; Timer Profiler::sm_timer; -profiler::HostTime Profiler::sm_start_at; +profiler::HostTime Profiler::sm_start_at = profiler::HostTime::min(); std::atomic_uint64_t Profiler::sm_last_id = 0; bool Profiler::sm_profiling = false; thread_local std::unique_ptr Profiler::tm_profiler = std::make_unique(); @@ -50,13 +51,16 @@ std::atomic_size_t Profiler::sm_preferred_capacity; auto Profiler::get_thread_dict() -> thread_dict_t { thread_dict_t thread_dict; for (auto&& [tid, profiler]: sm_profilers) { - thread_dict[tid] = profiler->m_thread_name; + thread_dict[tid] = sys::get_thread_name(tid); } return thread_dict; } void Profiler::dump_profile(std::string basename, std::string format, bundle_t result) { - std::unordered_map format_table; + static std::unordered_map format_table = { + {"chrome_timeline.json", profiler::dump_chrome_timeline}, + {"memory_flow.svg", profiler::dump_memory_flow}, + }; auto iter = format_table.find(format); if (iter == format_table.end()) { mgb_log_error("unsupported profiling format %s", format.c_str()); diff --git a/imperative/src/impl/profiler/chrome_timeline.cpp b/imperative/src/impl/profiler/chrome_timeline.cpp index c315c6dd..3ee8a192 100644 --- a/imperative/src/impl/profiler/chrome_timeline.cpp +++ b/imperative/src/impl/profiler/chrome_timeline.cpp @@ -86,10 +86,6 @@ public: m_args[key] = value; return *this; } - ChromeTraceEvent& arg(std::string key, nlohmann::json value) { - m_args[key] = value; - return *this; - } ChromeTraceEvent& stack(Trace trace) { m_stack = std::move(trace); return *this; @@ -163,7 +159,7 @@ public: return m_content.back(); } - std::string metadata(std::string key) { + std::string& metadata(std::string key) { return m_metadata[key]; } @@ -184,8 +180,8 @@ public: std::string to_string() const { auto json = to_json(); - return "{" "traceEvents:" + nlohmann::to_string(json["traceEvents"]) + "," - "metadata:" + nlohmann::to_string(json["metadata"]) + "}"; + return "{" "\"traceEvents\":" + nlohmann::to_string(json["traceEvents"]) + "," + "\"metadata\":" + nlohmann::to_string(json["metadata"]) + "}"; } private: std::vector m_content; @@ -360,20 +356,40 @@ struct ChromeTimelineEventVisitor: EventVisitor { new_host_event("AutoEvict", 'B'); } else if constexpr (std::is_same_v) { new_host_event("AutoEvict", 'E'); + } else if constexpr (std::is_same_v) { + new_device_event("HostToDevice", 'B', event.device); + } else if constexpr (std::is_same_v) { + new_device_event("HostToDevice", 'E', event.device) + .arg("shape", event.layout.TensorShape::to_string()) + .arg("dtype", event.layout.dtype.name()) + .arg("nr_elements", event.layout.total_nr_elems()) + .arg("device", event.device.to_string()); } } void notify_counter(std::string key, int64_t old_val, int64_t new_val) { new_host_event(key, 'C').arg("value", new_val); } + + void name_threads(Profiler::thread_dict_t thread_dict) { + for (auto&& [tid, tname]: thread_dict) { + trace_events.new_event() + .name("thread_name") + .pid('M') + .tid(to_tid(tid)) + .arg("name", tname); + } + } }; void dump_chrome_timeline(std::string filename, Profiler::bundle_t result){ ChromeTimelineEventVisitor visitor; visitor.process_events(result); - visitor.trace_events.metadata("localTime") = std::to_string(result.start_at.time_since_epoch().count()); - std::string json_repr = visitor.trace_events.to_string(); + visitor.name_threads(result.thread_dict); + auto trace_events = std::move(visitor.trace_events); + trace_events.metadata("localTime") = std::to_string(result.start_at.time_since_epoch().count()); + std::string json_repr = trace_events.to_string(); mgb::debug::write_to_file(filename.c_str(), json_repr); } diff --git a/imperative/src/impl/profiler/states.h b/imperative/src/impl/profiler/states.h index fe8682aa..d50898b4 100644 --- a/imperative/src/impl/profiler/states.h +++ b/imperative/src/impl/profiler/states.h @@ -234,7 +234,8 @@ public: SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent, TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent, - CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent> converter; + CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent, + HostToDeviceEvent, HostToDeviceFinishEvent> converter; auto for_each_entry = [&](auto&& handler) { for (auto& entry: bundle.entries) { @@ -308,6 +309,7 @@ public: if constexpr (is_op_event::value) { current_op = &m_operators.at(event.op_id); } else if constexpr (is_tensor_event::value) { + mgb_assert(m_tensors.count(event.tensor_id) != 0, "tensor not found"); current_tensor = &m_tensors.at(event.tensor_id); } if constexpr (std::is_same_v) { diff --git a/imperative/src/impl/profiler_plugin.cpp b/imperative/src/impl/profiler_plugin.cpp index 6a0525a5..64b91948 100644 --- a/imperative/src/impl/profiler_plugin.cpp +++ b/imperative/src/impl/profiler_plugin.cpp @@ -31,6 +31,19 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { if (m_opr_dict.empty() && m_var_dict.empty()) { init_seq(event.exec); } + Profiler::record("Constants"); + for (auto&& [var, var_info]: m_var_dict) { + if (var_info->is_const) { + bool valid = var->dev_tensor_valid(); + auto layout = valid ? var->layout() : TensorLayout(); + var_info->id = Profiler::next_id(); + Profiler::record(var_info->id, var->name()); + Profiler::record(var_info->id, layout, var->comp_node(), valid ? var->dev_tensor().raw_ptr() : nullptr); + } else { + var_info->rt_ref_cnt = var_info->ref_cnt; + } + } + Profiler::record("Constants"); Profiler::record("DispatchOprs"); event.exec->iter_opr_seq([this](OperatorNodeBase* opr) -> bool{ auto& opr_info = get_opr_info(opr); @@ -44,26 +57,15 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { } auto opr_name = opr->dyn_typeinfo()->name; auto copy_params = [params = opr_info.params] { return *params; }; - Profiler::record(opr_info.id, opr_name, copy_params, inputs, outputs); for (auto output: opr->output()) { - auto var_id = get_var_info(output).id; - Profiler::record(var_id); + auto& var_id = get_var_info(output).id; + var_id = Profiler::next_id(); + Profiler::record(var_id, output->name()); } + Profiler::record(opr_info.id, opr_name, copy_params, inputs, outputs); return true; }); Profiler::record("DispatchOprs"); - Profiler::record("Constants"); - for (auto&& [var, var_info]: m_var_dict) { - if (var_info->is_const) { - bool valid = var->dev_tensor_valid(); - auto layout = valid ? var->layout() : TensorLayout(); - Profiler::record(var_info->id); - Profiler::record(var_info->id, layout, var->comp_node(), valid ? var->dev_tensor().raw_ptr() : nullptr); - } else { - var_info->rt_ref_cnt = var_info->ref_cnt; - } - } - Profiler::record("Constants"); }; auto on_opr_start = [this](OprExecStart const& event) { OperatorNodeBase* opr = event.opr; @@ -144,6 +146,7 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { Profiler::record(var_info->id); } Profiler::record(var_info->id, var_info->ref_cnt); + var_info->id = 0; } }; add_event_handler(graph->event().register_receiver(on_seq_start)); @@ -194,11 +197,12 @@ ProfilerPlugin::OprInfo& ProfilerPlugin::register_opr(cg::OperatorNodeBase *opr) ProfilerPlugin::VarInfo& ProfilerPlugin::register_var(cg::VarNode *var) { auto info = std::make_unique(); - info->id = Profiler::next_id(); + info->id = 0; info->is_const = false; info->ref_cnt = 0; info->rt_ref_cnt = 0; - return *m_var_dict.insert({var, std::move(info)}).first->second; + mgb_assert(m_var_dict.count(var) == 0, "var exists"); + return *(m_var_dict[var] = std::move(info)); } ProfilerPlugin::OprInfo& ProfilerPlugin::get_opr_info(cg::OperatorNodeBase *opr) { diff --git a/imperative/src/include/megbrain/imperative/profiler.h b/imperative/src/include/megbrain/imperative/profiler.h index 1a418292..b33745d4 100644 --- a/imperative/src/include/megbrain/imperative/profiler.h +++ b/imperative/src/include/megbrain/imperative/profiler.h @@ -81,10 +81,9 @@ public: private: std::thread::id m_thread_id; std::vector m_records; - std::vector m_duration_stack; std::atomic m_status = Running; - std::string m_thread_name; + static std::vector sm_records; static options_t sm_profile_options; static std::mutex sm_mutex; static std::unordered_map sm_profilers; @@ -99,9 +98,6 @@ public: Profiler() { m_thread_id = std::this_thread::get_id(); MGB_LOCK_GUARD(sm_mutex); - if (sm_profilers.size() == 0) { - reset(); - } mgb_assert(sm_profilers.count(m_thread_id) == 0); sm_profilers[m_thread_id] = this; } @@ -109,17 +105,13 @@ public: MGB_LOCK_GUARD(sm_mutex); mgb_assert(sm_profilers.count(m_thread_id) == 1); sm_profilers.erase(m_thread_id); + sm_records.insert(sm_records.end(), m_records.begin(), m_records.end()); } public: static Profiler& get_instance() { return *tm_profiler; } - static void reset() { - mgb_assert(sm_profilers.size() == 0, "profiler already running"); - sm_start_at = profiler::HostTime::min(); - } - static uint64_t next_id() { return sm_last_id++; } @@ -151,12 +143,10 @@ public: mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting)); } } - std::vector profile_data; + std::vector profile_data = std::move(sm_records); for (auto&& [tid, profiler]: sm_profilers) { sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); - for (auto& record: profiler->m_records) { - profile_data.push_back(std::move(record)); - } + profile_data.insert(profile_data.end(), profiler->m_records.begin(), profiler->m_records.end()); profiler->m_records.clear(); profiler->m_records.reserve(sm_preferred_capacity); } @@ -238,5 +228,10 @@ public: } }; +#define MGB_RECORD_EVENT(type, ...) \ + if (mgb::imperative::Profiler::is_profiling()) { \ + mgb::imperative::Profiler::record(type{__VA_ARGS__}); \ + } \ + } // namespace imperative } // namespace mgb