@@ -39,6 +39,9 @@ endif() | |||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3) | |||
target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | |||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json ${PROJECT_BINARY_DIR}/third_party/Json) | |||
target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json) | |||
target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR}) | |||
target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) | |||
target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter) | |||
@@ -1013,9 +1013,8 @@ void init_tensor(py::module m) { | |||
interpreter_for_py->sync(); | |||
imperative::Profiler::stop_profile(); | |||
auto results = imperative::Profiler::collect(); | |||
auto options = imperative::Profiler::get_options(); | |||
return [results=std::move(results), options=std::move(options)](std::string basename, std::string format){ | |||
imperative::Profiler::dump_profile(basename, format, results, options); | |||
return [results=std::move(results)](std::string basename, std::string format){ | |||
imperative::Profiler::dump_profile(basename, format, results); | |||
}; | |||
}, py::call_guard<py::gil_scoped_release>()); | |||
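
// Note (not part of the patch): stop_profile now snapshots the collected
// results once and returns a closure, so Python can dump the same profile
// later under any basename/format. A minimal standalone sketch of that
// capture-and-defer pattern; Bundle and make_dumper are stand-in names.
#include <functional>
#include <string>
#include <utility>
#include <vector>

// Stand-in for Profiler::bundle_t.
using Bundle = std::vector<int>;

std::function<void(std::string, std::string)> make_dumper(Bundle results) {
    // Move the snapshot into the closure; later dumps reuse it without
    // touching the (already stopped) profiler.
    return [results = std::move(results)](std::string basename, std::string format) {
        // In MegEngine this forwards to Profiler::dump_profile(basename, format, results).
        (void)basename;
        (void)format;
        (void)results;
    };
}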
m.def("sync", | |||
@@ -19,6 +19,7 @@ | |||
#include "megbrain/imperative/op_def.h" | |||
#include "megbrain/imperative/utils/to_string.h" | |||
#include "./stack_manager.h" | |||
#include "./tensor_info.h" | |||
namespace mgb::imperative { | |||
@@ -193,7 +194,7 @@ struct PopScope { | |||
} | |||
}; | |||
using Command = std::variant<Put, | |||
using CommandData = std::variant<Put, | |||
ApplyOp, | |||
Del, | |||
GetValue, | |||
@@ -206,14 +207,20 @@ using Command = std::variant<Put, | |||
PushScope, | |||
PopScope>; | |||
using IdentifiedCommand = std::pair<uint64_t, Command>; | |||
struct Command { | |||
uint64_t id; | |||
CommandData data; | |||
StackManager::Trace trace; | |||
}; | |||
// using IdentifiedCommand = std::pair<uint64_t, Command>; | |||
} | |||
template <> | |||
struct ToStringTrait<interpreter::intl::Command>{ | |||
std::string operator()(const interpreter::intl::Command& cmd) const { | |||
return std::visit([](const auto& cmd){ | |||
std::string content = std::visit([](const auto& cmd){ | |||
std::string result = cmd.get_name(); | |||
result += "{"; | |||
cmd.get_props([&](const char* key, auto&& value) { | |||
@@ -224,7 +231,8 @@ struct ToStringTrait<interpreter::intl::Command>{ | |||
}); | |||
result += "}"; | |||
return result; | |||
}, cmd); | |||
}, cmd.data); | |||
return content; | |||
} | |||
}; | |||
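
// Note (not part of the patch): the payload variant is renamed to CommandData
// and wrapped in a Command aggregate that carries its own id (and, in the real
// struct, the dispatch-site StackManager::Trace). A self-contained sketch of
// the dispatch pattern used by process_one_task, with payloads reduced to two
// stand-ins:
#include <cstdint>
#include <string>
#include <type_traits>
#include <variant>

struct Put { uint64_t dest; };
struct Del { uint64_t dest; };

using CommandData = std::variant<Put, Del>;

struct Command {
    uint64_t id;
    CommandData data;
    // The real struct also carries a StackManager::Trace.
};

std::string describe(const Command& cmd) {
    // Same dispatch shape as process_one_task: visit data, branch on type.
    return std::visit([](const auto& c) -> std::string {
        using T = std::decay_t<decltype(c)>;
        if constexpr (std::is_same_v<T, Put>) {
            return "Put " + std::to_string(c.dest);
        } else {
            return "Del " + std::to_string(c.dest);
        }
    }, cmd.data);
}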
@@ -127,9 +127,8 @@ Handle ChannelImpl::put(const HostTensorND& value, bool no_cache) { | |||
MGB_LOCK_GUARD(m_spin); | |||
mgb_assert(check_available(), "Channel already closed"); | |||
auto& state = get_channel_state(); | |||
state.scopes.push("Put"); | |||
auto _ = StackManager::Guard{"Put", &state.stack_manager}; | |||
auto info = put_impl(value, no_cache); | |||
state.scopes.pop("Put"); | |||
return info; | |||
} | |||
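
// Note (not part of the patch): the explicit scopes.push/scopes.pop pairs give
// way to an RAII guard, so the scope is also exited when put_impl throws. A
// minimal sketch of the idea with a hypothetical StackManagerLike stand-in:
#include <string>
#include <utility>

struct StackManagerLike {
    void enter(const std::string&) {}
    void exit(const std::string&) {}
};

// Same idea as StackManager::Guard: the destructor runs on every path out of
// the enclosing block, including stack unwinding, which manual
// push("Put")/pop("Put") pairs did not guarantee.
class ScopeGuard {
    std::string m_name;
    StackManagerLike* m_mgr;
public:
    ScopeGuard(std::string name, StackManagerLike* mgr)
            : m_name{std::move(name)}, m_mgr{mgr} {
        m_mgr->enter(m_name);
    }
    ~ScopeGuard() { m_mgr->exit(m_name); }
};

void put_like(StackManagerLike& sm) {
    ScopeGuard _{"Put", &sm};
    // ... work that may throw; "Put" is still exited during unwinding ...
}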
@@ -158,16 +157,15 @@ Handle ChannelImpl::put(const DeviceTensorND& data, const HostTensorND& hvalue) | |||
} | |||
TensorInfo* ChannelImpl::put_impl(const DeviceTensorND& data, const HostTensorND& hvalue) { | |||
auto& state = get_channel_state(); | |||
state.scopes.push("Put"); | |||
auto _ = StackManager::Guard{"Put", &state.stack_manager}; | |||
auto info = alloc(); | |||
RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandEvent::Put); | |||
RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandKind::Put); | |||
init(info, {data.layout(), data.comp_node()}); | |||
info->mem_desc.id = StorageIdentifier::make(++m_storage_id); | |||
info->ptr = Tensor::make(data, hvalue); | |||
RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr()); | |||
info->status = TensorInfo::Produced; | |||
RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandFinishEvent::Put); | |||
state.scopes.pop("Put"); | |||
RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandKind::Put); | |||
return info; | |||
} | |||
@@ -230,7 +228,7 @@ void ChannelImpl::dispatch_default_cpu( | |||
auto& state = get_channel_state(); | |||
auto name = op->trait()->make_name(*op); | |||
state.scopes.push(name); | |||
auto _ = StackManager::Guard(name, &state.stack_manager); | |||
auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | |||
RECORD_EVENT(ShapeInferEvent, validated); | |||
@@ -291,9 +289,9 @@ void ChannelImpl::dispatch_default_cpu( | |||
} | |||
return op_info; | |||
}; | |||
RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, tinfo_to_tid(input_infos), tinfo_to_tid(output_infos)); | |||
state.scopes.pop(name); | |||
RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, | |||
tinfo_to_tid(input_infos), tinfo_to_tid(output_infos), | |||
state.stack_manager.dump()); | |||
} | |||
void ChannelImpl::dispatch_kernel( | |||
@@ -305,7 +303,7 @@ void ChannelImpl::dispatch_kernel( | |||
auto& options = state.options; | |||
auto name = op->trait()->make_name(*op); | |||
state.scopes.push(name); | |||
auto _ = StackManager::Guard{name, &state.stack_manager}; | |||
auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | |||
RECORD_EVENT(ShapeInferEvent, validated); | |||
@@ -334,7 +332,9 @@ void ChannelImpl::dispatch_kernel( | |||
} | |||
return op_info; | |||
}; | |||
RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs)); | |||
RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, | |||
tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs), | |||
state.stack_manager.dump()); | |||
m_buffer.enqueue(std::move(cmd)); | |||
if (!validated && options.async_level == 1) { | |||
sync_impl(); | |||
@@ -346,7 +346,6 @@ void ChannelImpl::dispatch_kernel( | |||
info->ptr->comp_node().sync(); | |||
} | |||
} | |||
state.scopes.pop(name); | |||
} | |||
SmallVector<Handle> ChannelImpl::apply_op( | |||
@@ -505,7 +504,8 @@ TensorInfo* ChannelImpl::alloc() { | |||
}(); | |||
info->id = Profiler::next_id(); | |||
if (Profiler::is_profiling()) { | |||
info->name = state.scopes.next_tensor_name(); | |||
size_t tensor_id = state.stack_manager.current()->next_id("tensor"); | |||
info->name = state.stack_manager.dump().to_string() + ssprintf(":%zu", tensor_id); | |||
} | |||
return info; | |||
} | |||
@@ -554,7 +554,7 @@ void ChannelImpl::free(TensorInfo* ptr) { | |||
} | |||
void ChannelImpl::recursive_free(TensorInfo* ptr) { | |||
RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandEvent::RecFree); | |||
RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandKind::RecFree); | |||
SmallVector<TensorInfo*> inps; | |||
if (ptr->producer) { | |||
for (auto i : ptr->producer->inputs) { | |||
@@ -569,7 +569,7 @@ void ChannelImpl::recursive_free(TensorInfo* ptr) { | |||
recursive_free(i); | |||
} | |||
} | |||
RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandFinishEvent::RecFree); | |||
RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandKind::RecFree); | |||
} | |||
void ChannelImpl::real_free(TensorInfo* ptr) { | |||
@@ -625,9 +625,9 @@ void ChannelImpl::regenerate(TensorInfo* dest) { | |||
m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest}); | |||
if (!m_applying) flush_apply_stack(); | |||
} else if (dest->evict_type == EvictType::SWAP) { | |||
RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandEvent::ReGen); | |||
RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen); | |||
produce_tensor(dest, Tensor::make(dest->h_value)); | |||
RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandFinishEvent::ReGen); | |||
RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandKind::ReGen); | |||
} | |||
} | |||
@@ -721,22 +721,24 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) { | |||
// refcnt --, owners: [tensor_inputs] | |||
// if it's decreased to 1, it will be detected at @see: proxy_graph_detail::apply_on_physical_tensor | |||
uint64_t del_id = del->id; | |||
RECORD_EVENT(OpDelEvent, del_id); | |||
RECORD_EVENT(TensorCommandEvent, del_id, TensorCommandKind::Del); | |||
free(del); | |||
RECORD_EVENT(OpDelFinishEvent, del_id); | |||
RECORD_EVENT(TensorCommandFinishEvent, del_id, TensorCommandKind::Del); | |||
} | |||
// Before wait | |||
// TODO: split operator wait and execute so that OpWait can be correctly recorded. | |||
// Before execute | |||
for (auto&& [device, kernel_id]: kernels) { | |||
RECORD_EVENT(KernelExecuteEvent, apply_id, kernel_id, Timer::record_event(device)); | |||
RECORD_EVENT(KernelLaunchEvent, apply_id, kernel_id, device); | |||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); | |||
} | |||
// Apply op | |||
// Here std::move is REQUIRED for removing duplicated references. | |||
auto outputs = apply_on_physical_tensor(apply_on_physical_tensor, *cmd.op, inputs); | |||
// After execute | |||
for (auto&& [device, kernel_id]: kernels) { | |||
RECORD_EVENT(KernelExecuteFinishEvent, apply_id, kernel_id, Timer::record_event(device)); | |||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); | |||
RECORD_EVENT(KernelLaunchFinishEvent, apply_id, kernel_id, device); | |||
} | |||
// End profiling operator | |||
mgb_assert(outputs.size() == cmd.outputs.size()); | |||
@@ -787,7 +789,7 @@ void ChannelImpl::flush_apply_stack() { | |||
m_dtr.pin(cmd.inputs); | |||
} | |||
if (recomp) { | |||
RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandEvent::ReGen); | |||
RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandKind::ReGen); | |||
} | |||
} | |||
bool regen = false; | |||
@@ -810,7 +812,7 @@ void ChannelImpl::flush_apply_stack() { | |||
m_apply_stack.pop(); | |||
do_apply_op(cmd_backup); | |||
if (recomp_backup) { | |||
RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandFinishEvent::ReGen); | |||
RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen); | |||
for (auto o : cmd_backup.outputs) { | |||
if (o) { | |||
m_dtr.update_dsu_after_recompute(o); | |||
@@ -902,7 +904,7 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { | |||
check_worker_exc_unsafe(); | |||
return require_host ? host_available() : static_cast<bool>(info->ptr); | |||
}); | |||
RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop, m_waitee == nullptr); | |||
RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop); | |||
m_waitee = nullptr; | |||
return info->ptr; | |||
} | |||
@@ -1003,7 +1005,7 @@ std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPt | |||
return {outputs_desc, alloc_storage(outputs_desc), alloc_storage(workspaces_desc)}; | |||
} | |||
void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||
void ChannelImpl::process_one_task(Command& icmd) { | |||
using namespace ranges; | |||
using namespace ranges::views; | |||
auto& state = get_worker_state(); | |||
@@ -1012,10 +1014,12 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||
auto cmd_visitor = [&](const auto& cmd) { | |||
using T = std::decay_t<decltype(cmd)>; | |||
if constexpr (std::is_same_v<T, Put>) { | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Put); | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Put); | |||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); | |||
auto value = cmd.no_cache ? std::make_shared<Tensor>(cmd.value) : Tensor::make(cmd.value); | |||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); | |||
produce_tensor(cmd.dest, std::move(value)); | |||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Put); | |||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Put); | |||
sample_on_device(cmd.dest->desc.comp_node, false); | |||
} else if constexpr (std::is_same_v<T, ApplyOp>) { | |||
for (auto& i : cmd.inputs) { | |||
@@ -1084,11 +1088,11 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||
} | |||
} | |||
} else if constexpr (std::is_same_v<T, Del>) { | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Del); | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Del); | |||
CompNode device = cmd.dest->desc.comp_node; | |||
uint64_t tensor_id = cmd.dest->id; | |||
free(cmd.dest); | |||
RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandFinishEvent::Del); | |||
RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandKind::Del); | |||
sample_on_device(device, false); | |||
} else if constexpr (std::is_same_v<T, GetValue>) { | |||
if (cmd.dest->invalid) return; | |||
@@ -1102,26 +1106,26 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||
imperative_log_profile_end("GetValue"); | |||
} else if constexpr (std::is_same_v<T, SwapIn>) { | |||
if (cmd.dest->invalid) return; | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapIn); | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapIn); | |||
produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value)); | |||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapIn); | |||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapIn); | |||
sample_on_device(cmd.dest->desc.comp_node, false); | |||
} else if constexpr (std::is_same_v<T, SwapOut>) { | |||
if (cmd.dest->invalid) return; | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapOut); | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapOut); | |||
cmd.dest->h_value = cmd.dest->ptr->get_value(); | |||
if (cmd.dest->evict_type == EvictType::NONE) { | |||
cmd.dest->evict_type = EvictType::SWAP; | |||
cmd.dest->status = TensorInfo::Swapped; | |||
release_tensor(cmd.dest); | |||
} | |||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapOut); | |||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapOut); | |||
sample_on_device(cmd.dest->desc.comp_node, false); | |||
} else if constexpr (std::is_same_v<T, Drop>) { | |||
if (cmd.dest->invalid) return; | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Drop); | |||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Drop); | |||
do_drop(cmd.dest, true); | |||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Drop); | |||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Drop); | |||
} else if constexpr (std::is_same_v<T, SetOption>) { | |||
options.set_option(cmd.key, cmd.value); | |||
} else if constexpr (std::is_same_v<T, StartProfile>) { | |||
@@ -1138,6 +1142,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||
if (Profiler::get_option("sample_rate", 0)) { | |||
sample_on_device(device, true); | |||
} | |||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); | |||
}); | |||
RECORD_EVENT(StartProfileFinishEvent); | |||
} else if constexpr (std::is_same_v<T, StopProfile>) { | |||
@@ -1186,7 +1191,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||
notify_tensor_unsafe(m_waitee); | |||
} | |||
} | |||
}, icmd.second); | |||
}, icmd.data); | |||
} | |||
void ChannelImpl::check_worker_exc_unsafe() { | |||
@@ -1203,12 +1208,13 @@ void ChannelImpl::check_worker_exc_unsafe() { | |||
} | |||
} | |||
void ChannelImpl::CommandBuffer::enqueue(Command cmd) { | |||
void ChannelImpl::CommandBuffer::enqueue(CommandData cmd) { | |||
auto& state = m_owner->get_channel_state(); | |||
if (std::get_if<Del>(&cmd) && fuse_del(std::get<Del>(cmd))) { | |||
return; | |||
} | |||
// mgb_log_debug("%s Enqueued", to_string(cmd).c_str()); | |||
m_commands.push_back(std::move(cmd)); | |||
m_commands.push_back({Profiler::next_id(), std::move(cmd), state.stack_manager.dump()}); | |||
auto flush_pos = flush_pos_for(m_commands.back()); | |||
flush(flush_pos); | |||
} | |||
@@ -1222,7 +1228,7 @@ void ChannelImpl::CommandBuffer::flush(Handle pos) { | |||
if (Profiler::is_profiling()) { | |||
mgb_log_debug("%s Flushed", to_string(*iter).c_str()); | |||
} | |||
m_owner->m_worker.add_task(IdentifiedCommand{Profiler::next_id(), std::move(*iter)}); | |||
m_owner->m_worker.add_task(std::move(*iter)); | |||
} | |||
m_commands.erase(m_commands.begin(), pos); | |||
} | |||
@@ -1248,7 +1254,7 @@ auto ChannelImpl::CommandBuffer::flush_pos_for(const Command& cmd) -> Handle { | |||
return m_commands.begin() + (m_commands.size() - buffer_length); | |||
} | |||
return m_commands.begin(); | |||
}, cmd); | |||
}, cmd.data); | |||
} | |||
/** | |||
@@ -1261,7 +1267,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) { | |||
// TODO: eliminate Puts | |||
auto begin = m_commands.begin(), end = m_commands.end(); | |||
auto apply_iter = std::find_if(begin, end, [dest](const Command& cmd){ | |||
if (auto* apply = std::get_if<ApplyOp>(&cmd)) { | |||
if (auto* apply = std::get_if<ApplyOp>(&cmd.data)) { | |||
return std::count(apply->inputs.begin(), apply->inputs.end(), dest) > 0; | |||
} | |||
return false; | |||
@@ -1270,7 +1276,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) { | |||
return false; | |||
} | |||
// mgb_log_debug("%s Fused", to_string(Command{cmd}).c_str()); | |||
std::get<ApplyOp>(*apply_iter).dels.push_back(dest); | |||
std::get<ApplyOp>(apply_iter->data).dels.push_back(dest); | |||
return true; | |||
} | |||
@@ -1297,7 +1303,7 @@ auto ChannelImpl::CommandBuffer::find_last_usage(TensorInfo* dest, Range range) | |||
found = iter; | |||
} | |||
} | |||
}, *iter); | |||
}, iter->data); | |||
}; | |||
return found; | |||
} | |||
@@ -1313,7 +1319,7 @@ auto ChannelImpl::CommandBuffer::find_produce(TensorInfo* dest, Range range) | |||
return cmd.dest == dest; | |||
} | |||
return false; | |||
}, cmd); | |||
}, cmd.data); | |||
}); | |||
} | |||
@@ -1340,7 +1346,7 @@ void ChannelImpl::push_scope(std::string name) { | |||
MGB_LOCK_GUARD(m_spin); | |||
mgb_assert(check_available(), "Channel already closed"); | |||
auto& state = get_channel_state(); | |||
state.scopes.push(name); | |||
state.stack_manager.enter(name); | |||
RECORD_EVENT(ScopeEvent, name); | |||
m_buffer.enqueue(PushScope{name}); | |||
} | |||
@@ -1349,7 +1355,7 @@ void ChannelImpl::pop_scope(std::string name) { | |||
MGB_LOCK_GUARD(m_spin); | |||
mgb_assert(check_available(), "Channel already closed"); | |||
auto& state = get_channel_state(); | |||
state.scopes.pop(name); | |||
state.stack_manager.exit(name); | |||
RECORD_EVENT(ScopeFinishEvent, name); | |||
m_buffer.enqueue(PopScope{name}); | |||
} | |||
@@ -26,6 +26,7 @@ | |||
#include "./commands.h" | |||
#include "./tensor_info.h" | |||
#include "./option_manager.h" | |||
#include "./stack_manager.h" | |||
#include "../profiler/events.h" | |||
@@ -94,7 +95,7 @@ private: | |||
TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop); | |||
void notify_tensor_unsafe(TensorInfo* info); | |||
void process_one_task(IdentifiedCommand&); | |||
void process_one_task(Command&); | |||
void check_worker_exc_unsafe(); | |||
@@ -129,10 +130,10 @@ private: | |||
void assert_in_worker(); | |||
std::thread::id get_worker_tid(); | |||
template <typename TCommand> | |||
void enqueue_command(TCommand&& cmd) { | |||
m_buffer.enqueue(Command{std::forward<TCommand>(cmd)}); | |||
} | |||
// template <typename TCommand> | |||
// void enqueue_command(TCommand&& cmd) { | |||
// m_buffer.enqueue(Command{std::forward<TCommand>(cmd)}); | |||
// } | |||
void sample_on_device(CompNode device, bool force); | |||
@@ -153,13 +154,13 @@ private: | |||
bool m_applying = false; | |||
bool m_closed = false; | |||
struct WorkQueue : AsyncQueueSC<IdentifiedCommand, WorkQueue> { | |||
struct WorkQueue : AsyncQueueSC<Command, WorkQueue> { | |||
// set max_spin=0 to prevent the queue from fetching tasks in a busy-wait manner. | |||
// this won't affect throughput when the python interpreter is sending enough tasks, | |||
// but will significantly save CPU time when waiting for tasks, e.g. waiting for data input | |||
// limit pending tasks to 10000 | |||
WorkQueue(ChannelImpl* owner) | |||
: AsyncQueueSC<IdentifiedCommand, WorkQueue>(0, 10000), m_owner(owner) { | |||
: AsyncQueueSC<Command, WorkQueue>(0, 10000), m_owner(owner) { | |||
sys::set_thread_name("interpreter"); | |||
if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) { | |||
int len = strlen(env_val); | |||
@@ -171,7 +172,7 @@ private: | |||
update_max_items(val); | |||
} | |||
} | |||
void process_one_task(IdentifiedCommand& icmd) { | |||
void process_one_task(Command& icmd) { | |||
m_owner->process_one_task(icmd); | |||
} | |||
void on_async_queue_worker_thread_start() override; | |||
@@ -193,7 +194,7 @@ private: | |||
*/ | |||
struct CommandBuffer { | |||
CommandBuffer(ChannelImpl* owner) : m_owner(owner) {} | |||
void enqueue(Command cmd); | |||
void enqueue(CommandData cmd); | |||
bool empty() const { | |||
return m_commands.empty(); | |||
} | |||
@@ -224,91 +225,13 @@ private: | |||
//! level 0: both sync. | |||
int m_async_level = 2; | |||
struct Scope { | |||
std::string name; | |||
std::unordered_map<std::string, std::unique_ptr<Scope>> children; | |||
size_t version = 0; | |||
size_t parent_version = 0; | |||
size_t tensor_count = 0; | |||
Scope* active_child = nullptr; | |||
Scope* parent = nullptr; | |||
Scope* enter(std::string name) { | |||
auto& child = children[name]; | |||
if (!child) { | |||
child = std::make_unique<Scope>(); | |||
child->name = name; | |||
child->parent = this; | |||
} | |||
if (version != child->parent_version) { | |||
child->version = 0; | |||
child->parent_version = version; | |||
} else { | |||
child->version++; | |||
} | |||
child->tensor_count = 0; | |||
return active_child = child.get(); | |||
} | |||
Scope* exit(std::string name) { | |||
mgb_assert(this->name == name, "scope name mismatch"); | |||
parent->active_child = nullptr; | |||
return parent; | |||
} | |||
}; | |||
class ScopeManager { | |||
private: | |||
Scope m_root; | |||
Scope* m_current_scope = &m_root; | |||
public: | |||
class ScopeGuard{ | |||
private: | |||
ScopeManager* m_manager; | |||
std::string m_name; | |||
public: | |||
ScopeGuard(ScopeManager* manager, std::string name): m_manager{manager}, m_name{name} { | |||
m_manager->push(m_name); | |||
} | |||
~ScopeGuard() { | |||
m_manager->pop(m_name); | |||
} | |||
}; | |||
void push(std::string name) { | |||
m_current_scope = m_current_scope->enter(name); | |||
} | |||
void pop(std::string name) { | |||
m_current_scope = m_current_scope->exit(name); | |||
} | |||
std::string next_tensor_name() { | |||
std::string builder; | |||
Scope* scope = &m_root; | |||
while (true) { | |||
builder.append(scope->name); | |||
if (scope->version != 0) { | |||
builder.append(ssprintf("(%ld)", scope->version)); | |||
} | |||
if (scope != &m_root) { | |||
builder.append("."); | |||
} | |||
if (scope->active_child == nullptr) { | |||
builder.append(ssprintf(":%%%ld", scope->tensor_count++)); | |||
break; | |||
} else { | |||
scope = scope->active_child; | |||
} | |||
} | |||
return builder; | |||
} | |||
}; | |||
struct State { | |||
std::thread::id tid; | |||
OptionManager options; | |||
}; | |||
struct ChannelState: State { | |||
ScopeManager scopes; | |||
StackManager stack_manager; | |||
}; | |||
struct WorkerState: State {}; | |||
@@ -0,0 +1,188 @@ | |||
/** | |||
* \file imperative/src/impl/interpreter/stack_manager.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
*/ | |||
#pragma once | |||
#include <string> | |||
#include <memory> | |||
#include <unordered_map> | |||
#include "megbrain/utils/metahelper.h" | |||
#include "megbrain/utils/small_vector.h" | |||
namespace mgb::imperative::interpreter::intl { | |||
class StackSnapshot; | |||
class StackManager: public NonCopyableObj { | |||
public: | |||
class Node; | |||
class Guard; | |||
struct Frame; | |||
class Trace; | |||
private: | |||
std::unique_ptr<Node> m_root = nullptr; | |||
Node* m_current = nullptr; | |||
SmallVector<uint64_t> m_trace_id_stack; | |||
uint64_t m_last_trace_id = 0; | |||
public: | |||
StackManager(); | |||
std::pair<Node*, uint64_t> enter(std::string name); | |||
void exit(std::string name); | |||
Trace dump(); | |||
Node* current(); | |||
}; | |||
class StackManager::Node: public NonCopyableObj { | |||
private: | |||
std::string m_name; | |||
std::unordered_map<std::string, std::unique_ptr<Node>> m_children; | |||
std::unordered_map<std::string, size_t> m_id_table; | |||
Node* m_parent = nullptr; | |||
int64_t m_depth = -1; | |||
uint64_t m_version = 0; | |||
explicit Node(std::string name, Node* parent): m_name{name}, m_parent{parent} { | |||
if (parent) { | |||
m_depth = parent->m_depth + 1; | |||
} | |||
} | |||
public: | |||
const std::string& name() const { | |||
return m_name; | |||
} | |||
Node* operator[](const std::string& name) { | |||
auto& child = m_children[name]; | |||
if (child == nullptr) { | |||
child.reset(new Node(name, this)); | |||
} | |||
return child.get(); | |||
} | |||
Node* parent() { | |||
return m_parent; | |||
} | |||
bool is_root() { | |||
return m_parent == nullptr; | |||
} | |||
uint64_t version() const { | |||
return m_version; | |||
} | |||
void update_version() { | |||
++m_version; | |||
for (auto&& [key, child]: m_children) { | |||
child->reset_version(); | |||
} | |||
m_id_table.clear(); | |||
} | |||
void reset_version() { | |||
m_version = 0; | |||
m_id_table.clear(); | |||
} | |||
int64_t depth() const { | |||
return m_depth; | |||
} | |||
uint64_t next_id(std::string key) { | |||
return m_id_table[key]++; | |||
} | |||
static std::unique_ptr<Node> make() { | |||
return std::unique_ptr<Node>(new Node("", nullptr)); | |||
} | |||
}; | |||
class StackManager::Guard { | |||
private: | |||
std::string m_name; | |||
StackManager* m_manager; | |||
public: | |||
Guard(std::string name, StackManager* manager): m_name{name}, m_manager{manager}{ | |||
if (m_manager) { | |||
m_manager->enter(name); | |||
} | |||
} | |||
~Guard() { | |||
release(); | |||
} | |||
void release() { | |||
if (m_manager) { | |||
m_manager->exit(m_name); | |||
m_manager = nullptr; | |||
} | |||
} | |||
}; | |||
struct StackManager::Frame { | |||
StackManager::Node* node; | |||
uint64_t version; | |||
}; | |||
class StackManager::Trace { | |||
private: | |||
SmallVector<StackManager::Frame> m_frames; | |||
uint64_t m_id = 0; | |||
public: | |||
explicit Trace(StackManager::Node* top, uint64_t id): m_id{id} { | |||
int64_t nr_frames = top->depth() + 1; | |||
m_frames = SmallVector<StackManager::Frame>(nr_frames); | |||
StackManager::Node* node = top; | |||
for (int64_t i = 0; i < nr_frames; ++i) { | |||
m_frames[m_frames.size()-1-i] = {node, node->version()}; | |||
node = node->parent(); | |||
} | |||
        mgb_assert(node->is_root(), "trace must terminate at the root frame"); | |||
} | |||
Trace() = default; | |||
std::string to_string() const { | |||
std::string buffer; | |||
for (auto&& [node, version]: m_frames) { | |||
if (!buffer.empty()) { | |||
buffer.append("."); | |||
} | |||
buffer.append(node->name()); | |||
if (version != 0) { | |||
buffer.append(ssprintf("[%zu]", version)); | |||
} | |||
} | |||
return buffer; | |||
} | |||
const SmallVector<StackManager::Frame>& frames() const { | |||
return m_frames; | |||
} | |||
uint64_t id() const { | |||
return m_id; | |||
} | |||
}; | |||
inline StackManager::StackManager() { | |||
m_root = Node::make(); | |||
m_current = m_root.get(); | |||
} | |||
inline std::pair<StackManager::Node*, uint64_t> StackManager::enter(std::string name) { | |||
m_current = (*m_current)[name]; | |||
m_trace_id_stack.push_back(++m_last_trace_id); | |||
return {m_current, m_current->version()}; | |||
} | |||
inline void StackManager::exit(std::string name) { | |||
mgb_assert(m_current->name() == name, "scope name mismatch"); | |||
m_current = m_current->parent(); | |||
m_trace_id_stack.pop_back(); | |||
m_current->update_version(); | |||
} | |||
inline StackManager::Trace StackManager::dump() { | |||
return Trace(m_current, m_trace_id_stack.empty() ? 0 : m_trace_id_stack.back()); | |||
} | |||
inline StackManager::Node* StackManager::current() { | |||
return m_current; | |||
} | |||
} |
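
// Usage sketch (not part of the patch; assumes a MegEngine build, API exactly
// as declared above). Exiting a scope bumps the *parent's* version, so
// repeated visits to the same child are disambiguated in the dumped trace.
#include "./stack_manager.h"

using namespace mgb::imperative::interpreter::intl;

void example() {
    StackManager sm;
    {
        StackManager::Guard step{"step", &sm};
        {
            StackManager::Guard fwd{"forward", &sm};
            sm.dump().to_string();            // "step.forward"
            sm.current()->next_id("tensor");  // 0, then 1, 2, ... per scope visit
        }
        {
            StackManager::Guard fwd{"forward", &sm};
            // exit() above incremented the parent's version, so the second
            // visit dumps as "step[1].forward"; versions of 0 are omitted.
            sm.dump().to_string();
        }
    }  // names are checked on exit: a mismatched name trips mgb_assert
}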
@@ -102,7 +102,8 @@ std::vector<std::pair<const char*, std::string>> props(const OpDef& def) { | |||
} | |||
std::string make_name(const OpDef& def) { | |||
return "OprAttr"; | |||
auto&& attr = def.cast_final_safe<OprAttr>(); | |||
return attr.type; | |||
} | |||
OP_TRAIT_REG(OprAttr, OprAttr) | |||
@@ -27,25 +27,11 @@ | |||
namespace mgb { | |||
namespace imperative { | |||
uint64_t Timer::get_nsecs() { | |||
using namespace std::chrono; | |||
auto finish = steady_clock::now(); | |||
auto duration = duration_cast<nanoseconds>(finish - m_start); | |||
return duration.count(); | |||
profiler::Time Timer::record_host() { | |||
return std::chrono::high_resolution_clock::now(); | |||
} | |||
uint64_t Timer::get_started_at() { | |||
return m_started_at; | |||
} | |||
void Timer::reset() { | |||
using namespace std::chrono; | |||
m_start = steady_clock::now(); | |||
auto now_ns = duration_cast<nanoseconds>(std::chrono::system_clock::now().time_since_epoch()); | |||
m_started_at = now_ns.count(); | |||
} | |||
std::shared_ptr<CompNode::Event> Timer::record_event(CompNode device) { | |||
std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) { | |||
auto event = EventPool::with_timer().alloc_shared(device); | |||
event->record(); | |||
return event; | |||
@@ -55,13 +41,13 @@ Profiler::options_t Profiler::sm_profile_options; | |||
std::mutex Profiler::sm_mutex; | |||
std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers; | |||
Timer Profiler::sm_timer; | |||
profiler::HostTime Profiler::sm_start_at; | |||
std::atomic_uint64_t Profiler::sm_last_id = 0; | |||
bool Profiler::sm_profiling = false; | |||
thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>(); | |||
std::atomic_size_t Profiler::sm_preferred_capacity; | |||
auto Profiler::get_thread_dict() -> thread_dict_t { | |||
MGB_LOCK_GUARD(sm_mutex); | |||
thread_dict_t thread_dict; | |||
for (auto&& [tid, profiler]: sm_profilers) { | |||
thread_dict[tid] = profiler->m_thread_name; | |||
@@ -69,15 +55,13 @@ auto Profiler::get_thread_dict() -> thread_dict_t { | |||
return thread_dict; | |||
} | |||
void Profiler::dump_profile(std::string basename, std::string format, results_t results, options_t options) { | |||
auto thread_dict = get_thread_dict(); | |||
if (format == "chrome_timeline.json") { | |||
profiler::dump_chrome_timeline(basename, options, thread_dict, results); | |||
} else if (format == "memory_flow.svg") { | |||
profiler::dump_memory_flow(basename, options, thread_dict, results); | |||
} else { | |||
void Profiler::dump_profile(std::string basename, std::string format, bundle_t result) { | |||
    static std::unordered_map<std::string, void(*)(std::string, bundle_t)> format_table = { | |||
        {"chrome_timeline.json", profiler::dump_chrome_timeline}, | |||
        {"memory_flow.svg", profiler::dump_memory_flow}, | |||
    }; | |||
    auto iter = format_table.find(format); | |||
    if (iter == format_table.end()) { | |||
        mgb_log_error("unsupported profiling format %s", format.c_str()); | |||
        return; | |||
    } | |||
    (iter->second)(basename, std::move(result)); | |||
} | |||
} // namespace imperative | |||
@@ -17,6 +17,10 @@ | |||
#error Unsupported platform | |||
#endif | |||
#include "nlohmann/json.hpp" | |||
#include "megbrain/utils/debug.h" | |||
#include "./formats.h" | |||
#include "./states.h" | |||
@@ -53,8 +57,9 @@ public: | |||
m_idx = idx; | |||
return *this; | |||
} | |||
ChromeTraceEvent& ts(uint64_t ts) { | |||
m_ts = ts; | |||
template <typename TDuration> | |||
ChromeTraceEvent& ts(TDuration ts) { | |||
m_ts = std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(ts).count(); | |||
return *this; | |||
} | |||
ChromeTraceEvent& dur(uint64_t dur) { | |||
@@ -69,51 +74,46 @@ public: | |||
m_bp = bp; | |||
return *this; | |||
} | |||
ChromeTraceEvent& args(std::shared_ptr<json::Object> args) { | |||
ChromeTraceEvent& args(nlohmann::json args) { | |||
m_args = std::move(args); | |||
return *this; | |||
} | |||
ChromeTraceEvent& arg(std::string key, std::string value) { | |||
if (!m_args) { | |||
m_args = json::Object::make(); | |||
} | |||
(*m_args)[key] = json::String::make(value); | |||
m_args[key] = value; | |||
return *this; | |||
} | |||
ChromeTraceEvent& arg(std::string key, double value) { | |||
if (!m_args) { | |||
m_args = json::Object::make(); | |||
} | |||
(*m_args)[key] = json::Number::make(value); | |||
m_args[key] = value; | |||
return *this; | |||
} | |||
ChromeTraceEvent& arg(std::string key, std::shared_ptr<json::Value> value) { | |||
if (!m_args) { | |||
m_args = json::Object::make(); | |||
} | |||
(*m_args)[key] = value; | |||
ChromeTraceEvent& arg(std::string key, nlohmann::json value) { | |||
m_args[key] = value; | |||
return *this; | |||
} | |||
ChromeTraceEvent& stack(Trace trace) { | |||
m_stack = std::move(trace); | |||
return *this; | |||
} | |||
std::shared_ptr<json::Object> to_json() const { | |||
auto result = json::Object::make(); | |||
nlohmann::json to_json() const { | |||
nlohmann::json result; | |||
auto prop_str = [&](auto key, auto value) { | |||
if (value.empty()) { | |||
return; | |||
} | |||
(*result)[key] = json::String::make(value); | |||
result[key] = value; | |||
}; | |||
auto prop_num = [&](auto key, auto value) { | |||
if (!value) { | |||
return; | |||
} | |||
(*result)[key] = json::Number::make(value.value()); | |||
result[key] = value.value(); | |||
}; | |||
auto prop_char = [&](auto key, auto value) { | |||
if (!value) { | |||
return; | |||
} | |||
(*result)[key] = json::String::make(std::string{} + value.value()); | |||
result[key] = std::string{} + value.value(); | |||
}; | |||
prop_str("name", m_name); | |||
prop_str("cat", m_cat); | |||
@@ -126,8 +126,16 @@ public: | |||
prop_num("dur", m_dur); | |||
prop_char("ph", m_ph); | |||
prop_char("bp", m_bp); | |||
if (m_args) { | |||
(*result)["args"] = m_args; | |||
if (!m_args.empty()) { | |||
result["args"] = m_args; | |||
} | |||
if (m_stack) { | |||
nlohmann::json stack; | |||
for (auto&& frame: m_stack->frames()) { | |||
stack.push_back(ssprintf("%s%zu", frame.node->name().c_str(), (size_t)frame.version)); | |||
} | |||
std::reverse(stack.begin(), stack.end()); | |||
result["stack"] = stack; | |||
} | |||
return result; | |||
} | |||
@@ -140,11 +148,12 @@ private: | |||
std::optional<uint64_t> m_pid; | |||
std::optional<uint64_t> m_id; | |||
std::optional<uint64_t> m_idx; | |||
std::optional<uint64_t> m_ts; | |||
std::optional<double> m_ts; | |||
std::optional<uint64_t> m_dur; | |||
std::optional<char> m_ph; | |||
std::optional<char> m_bp; | |||
std::shared_ptr<json::Object> m_args; | |||
nlohmann::json m_args; | |||
std::optional<Trace> m_stack; | |||
}; | |||
class ChromeTraceEvents { | |||
@@ -154,368 +163,218 @@ public: | |||
return m_content.back(); | |||
} | |||
std::shared_ptr<json::Value> to_json() const { | |||
auto result = json::Object::make(); | |||
auto event_list = json::Array::make(); | |||
    std::string& metadata(std::string key) { | |||
        return m_metadata[key]; | |||
    } | |||
nlohmann::json to_json() const { | |||
nlohmann::json result; | |||
nlohmann::json event_list; | |||
nlohmann::json metadata; | |||
for (auto&& event: m_content) { | |||
event_list->add(event.to_json()); | |||
event_list.push_back(event.to_json()); | |||
} | |||
for (auto&& [key, value]: m_metadata) { | |||
metadata[key] = value; | |||
} | |||
(*result)["traceEvents"] = event_list; | |||
result["traceEvents"] = event_list; | |||
result["metadata"] = metadata; | |||
return result; | |||
} | |||
    std::string to_string() const { | |||
        auto json = to_json(); | |||
        return "{\"traceEvents\":" + nlohmann::to_string(json["traceEvents"]) + "," | |||
               "\"metadata\":" + nlohmann::to_string(json["metadata"]) + "}"; | |||
    } | |||
private: | |||
std::vector<ChromeTraceEvent> m_content; | |||
std::unordered_map<std::string, std::string> m_metadata; | |||
}; | |||
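
// Note (not part of the patch): for reference, a standalone sketch of the file
// shape this class emits, built directly with nlohmann::json. Field values are
// made up; ts is in microseconds since profiling started.
#include <iostream>

#include "nlohmann/json.hpp"

int main() {
    nlohmann::json event = {
        {"name", "OpDispatch"}, {"ph", "B"},
        {"pid", 1234}, {"tid", 0},
        {"ts", 15.2},  // microseconds
    };
    nlohmann::json trace = {
        {"traceEvents", nlohmann::json::array({event})},
        {"metadata", {{"localTime", "1620000000000"}}},
    };
    // chrome://tracing and Perfetto accept this layout directly.
    std::cout << trace.dump(2) << std::endl;
    return 0;
}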
void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results){ | |||
auto pid = getpid(); | |||
ProfileDataCollector collector; | |||
ProfileState state; | |||
#define HANDLE_EVENT(type, ...) \ | |||
collector.handle<type>([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ ); | |||
struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> { | |||
ChromeTraceEvents trace_events; | |||
decltype(getpid()) pid = getpid(); | |||
std::string pid_str = std::to_string(pid); | |||
#define NEW_HOST(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(state[tid].index).ph(PH).ts((double)time/1e3) | |||
#define NEW_DEVICE(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(256+state[event.event->comp_node()].index).ph(PH).ts((double)get_device_time(event.event, time)/1e3) | |||
#define OP_NAME op_state.name | |||
#define OP_KERNEL_NAME (op_state.name + "") | |||
#define OP_PROPS get_op_args(op_state) | |||
#define OP_ID event.op_id | |||
#define TENSOR_PROPS get_tensor_args(tensor_state, time) | |||
#define TENSOR_INFO get_tensor_info(tensor_state, time) | |||
#define TENSOR_COMMAND_KIND print_tensor_command_kind(event.kind) | |||
#define HANDLE_PLAIN_EVENT(START, FINISH, NAME_EXPR)\ | |||
HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\ | |||
HANDLE_EVENT(FINISH, { NEW_HOST(NAME_EXPR, 'E'); }) | |||
#define HANDLE_TENSOR_EVENT(START, FINISH, NAME_EXPR)\ | |||
HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\ | |||
HANDLE_EVENT(FINISH, { auto& tensor_state = state.tensors[event.tensor_id]; NEW_HOST(NAME_EXPR, 'E').args(TENSOR_PROPS); }) | |||
#define INC_COUNTER(NAME, DELTA)\ | |||
{ state.statics.NAME += DELTA; NEW_HOST(#NAME, 'C').arg(#NAME, state.statics.NAME); } | |||
auto get_tensor_args = [](const ProfileTensorState& tensor, uint64_t time) -> std::shared_ptr<json::Object> { | |||
auto args = json::Object::make(); | |||
(*args)["id"] = json::Number::make(tensor.id); | |||
(*args)["name"] = json::String::make(tensor.name); | |||
(*args)["shape"] = json::String::make(tensor.layout.TensorShape::to_string()); | |||
(*args)["dtype"] = json::String::make(tensor.layout.dtype.name()); | |||
(*args)["nr_elements"] = json::Number::make(tensor.layout.total_nr_elems()); | |||
(*args)["device"] = json::String::make(tensor.device.to_string()); | |||
if (tensor.produced) { | |||
(*args)["living_time"] = json::String::make(std::to_string((time - tensor.produced + tensor.living_time)/1e6) + "ms"); | |||
} | |||
return args; | |||
}; | |||
auto get_tensor_info = [](const ProfileTensorState& tensor, uint64_t time) -> std::string { | |||
std::string name = tensor.name; | |||
std::string shape = tensor.layout.TensorShape::to_string(); | |||
std::string size_in_bytes = std::to_string(tensor.size_in_bytes()); | |||
std::string device = tensor.device.to_string(); | |||
std::string dtype = tensor.layout.dtype.name(); | |||
return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.c_str()); | |||
}; | |||
ChromeTraceEvent& new_event(std::string name, char ph, size_t tid, profiler::HostTime time) { | |||
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts(since_start(time)); | |||
} | |||
auto get_op_args = [&](const ProfileOperatorState& op) -> std::shared_ptr<json::Object> { | |||
auto args = json::Object::make(); | |||
auto params = op.params; | |||
for (auto&& [name, value]: params) { | |||
(*args)[name] = json::String::make(value); | |||
} | |||
(*args)["__id__"] = json::Number::make(op.id); | |||
(*args)["__name__"] = json::String::make(op.name); | |||
(*args)["__device__"] = json::String::make(op.device.to_string()); | |||
return args; | |||
ChromeTraceEvent& new_host_event(std::string name, char ph) { | |||
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time)); | |||
}; | |||
auto get_device_time = [&](const std::shared_ptr<CompNode::Event>& event, uint64_t host) -> uint64_t { | |||
event->host_wait(); | |||
auto& device_state = state.devices[event->comp_node()]; | |||
if (!device_state.base_event) { | |||
device_state.base_event = event; | |||
device_state.base_time = host; | |||
return host; | |||
} | |||
uint64_t device = device_state.base_event->elapsed_time_until(*event) * 1e9 + device_state.base_time; | |||
return std::max(device, host); | |||
ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { | |||
using namespace std::literals::chrono_literals; | |||
auto time = since_start(to_device_time(current->time, device)); | |||
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time); | |||
}; | |||
auto print_tensor_command_kind = [&](int kind) -> const char* { | |||
const char* to_cstr(TensorCommandKind kind) { | |||
switch(kind) { | |||
case TensorCommandEvent::Put: | |||
case TensorCommandKind::Put: | |||
return "Put"; | |||
case TensorCommandEvent::Drop: | |||
case TensorCommandKind::Drop: | |||
return "Drop"; | |||
case TensorCommandEvent::Del: | |||
case TensorCommandKind::Del: | |||
return "Del"; | |||
case TensorCommandEvent::SwapIn: | |||
case TensorCommandKind::SwapIn: | |||
return "SwapIn"; | |||
case TensorCommandEvent::SwapOut: | |||
case TensorCommandKind::SwapOut: | |||
return "SwapOut"; | |||
case TensorCommandEvent::RecFree: | |||
case TensorCommandKind::RecFree: | |||
return "RecFree"; | |||
case TensorCommandEvent::ReGen: | |||
case TensorCommandKind::ReGen: | |||
return "ReGen"; | |||
case TensorCommandKind::GetValue: | |||
return "GetValue"; | |||
} | |||
return "UnknownCommand"; | |||
}; | |||
HANDLE_EVENT(OpDispatchEvent, { | |||
auto& op_state = state.operators[OP_ID] = {}; | |||
op_state.id = OP_ID; | |||
op_state.name = event.op_name; | |||
op_state.params = event.op_params(); | |||
op_state.inputs = event.inputs; | |||
op_state.outputs = event.outputs; | |||
NEW_HOST("OpDispatch", 'B'); | |||
NEW_HOST(ssprintf("%d", pid), 's') | |||
.cat("OpDispatch") | |||
.id(OP_ID) | |||
.scope(std::to_string(pid)); | |||
NEW_HOST("OpDispatch", 'E').args(OP_PROPS); | |||
INC_COUNTER(op_enqueue_count, 1); | |||
}); | |||
HANDLE_EVENT(OpExecuteEvent, { | |||
mgb_assert(OP_ID != 0); | |||
mgb_assert(state.operators.count(OP_ID) > 0); | |||
auto& op_state = state.operators[OP_ID]; | |||
op_state.host_begin = time; | |||
NEW_HOST(OP_NAME, 'B'); | |||
//.args(OP_PROPS); | |||
NEW_HOST(ssprintf("%d", pid), 't') | |||
.cat("OpDispatch") | |||
.id(OP_ID) | |||
.scope(std::to_string(pid)); | |||
INC_COUNTER(op_execute_count, 1); | |||
}); | |||
HANDLE_EVENT(OpExecuteFinishEvent, { | |||
auto& op_state = state.operators[event.op_id]; | |||
op_state.host_end = time; | |||
NEW_HOST(OP_NAME, 'E') | |||
.args(OP_PROPS); | |||
}); | |||
HANDLE_EVENT(KernelExecuteEvent, { | |||
auto& op_state = state.operators[event.op_id]; | |||
op_state.device_begin = event.event; | |||
NEW_HOST(ssprintf("%d", pid), 's') | |||
.id(event.kernel_id) | |||
.cat("KernelLaunch") | |||
.scope(std::to_string(pid)); | |||
NEW_DEVICE(OP_KERNEL_NAME, 'B') | |||
.cat("Kernel"); | |||
//.args(OP_PROPS); | |||
NEW_DEVICE(ssprintf("%d", pid), 'f') | |||
.id(event.kernel_id) | |||
.bp('e') | |||
.cat("KernelLaunch") | |||
.scope(std::to_string(pid)); | |||
}); | |||
HANDLE_EVENT(KernelExecuteFinishEvent, { | |||
auto& op_state = state.operators[event.op_id]; | |||
op_state.device_end = event.event; | |||
NEW_DEVICE(OP_KERNEL_NAME, 'E') | |||
.cat("Kernel") | |||
.args(OP_PROPS); | |||
}); | |||
HANDLE_EVENT(TensorDeclareEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id] = {}; | |||
tensor_state.id = event.tensor_id; | |||
tensor_state.name = event.name; | |||
}); | |||
HANDLE_EVENT(TensorProduceEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id]; | |||
tensor_state.device = event.device; | |||
tensor_state.layout = event.layout; | |||
tensor_state.produced = time; | |||
if (!tensor_state.living_time) { | |||
NEW_HOST(ssprintf("%d", pid), 's') | |||
.id(event.tensor_id) | |||
.cat("TensorLink") | |||
.scope(std::to_string(pid)); | |||
} else { | |||
NEW_HOST(ssprintf("%d", pid), 't') | |||
.id(event.tensor_id) | |||
.cat("TensorLink") | |||
.scope(std::to_string(pid)); | |||
} | |||
INC_COUNTER(alive_tensor_count, 1); | |||
INC_COUNTER(produce_tensor_count, 1); | |||
state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()}); | |||
state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced}); | |||
}); | |||
HANDLE_EVENT(TensorUsageEvent, { | |||
NEW_HOST(ssprintf("%d", pid), 't') | |||
.id(event.tensor_id) | |||
.cat("TensorLink") | |||
.scope(std::to_string(pid)); | |||
}); | |||
HANDLE_EVENT(TensorReleaseEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id]; | |||
tensor_state.living_time += time - tensor_state.produced; | |||
tensor_state.produced = 0; | |||
INC_COUNTER(alive_tensor_count, -1); | |||
INC_COUNTER(erase_tensor_count, 1); | |||
state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()}); | |||
state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced}); | |||
NEW_HOST(ssprintf("%d", pid), 't') | |||
.id(event.tensor_id) | |||
.cat("TensorLink") | |||
.scope(std::to_string(pid)); | |||
}); | |||
HANDLE_EVENT(TensorEraseEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id]; | |||
if (tensor_state.living_time) { | |||
NEW_HOST(ssprintf("%d", pid), 'f') | |||
.id(event.tensor_id) | |||
.bp('e') | |||
.cat("TensorLink") | |||
.scope(std::to_string(pid)); | |||
} | |||
if (event.use_count == 0) { | |||
INC_COUNTER(redundant_tensor_count, 1); | |||
} | |||
}); | |||
HANDLE_EVENT(TensorGetPropEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id]; | |||
NEW_HOST("TensorGetProp", 'X') | |||
.dur(0).args(TENSOR_PROPS); | |||
}); | |||
HANDLE_EVENT(TensorWaitPropEvent, { | |||
NEW_HOST("TensorWaitProp", 'B'); | |||
if (event.prop == TensorProp::HostValue) { | |||
INC_COUNTER(wait_value_count, 1); | |||
} else if (event.prop == TensorProp::Shape) { | |||
INC_COUNTER(wait_shape_count, 1); | |||
} | |||
INC_COUNTER(wait_prop_count, 1); | |||
}); | |||
HANDLE_EVENT(TensorWaitPropFinishEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id]; | |||
if (event.notified) { | |||
NEW_HOST(ssprintf("%d", pid), 'f') | |||
.id(event.tensor_id) | |||
.bp('e') | |||
.cat("TensorProp") | |||
.scope(std::to_string(pid)); | |||
} | |||
NEW_HOST("TensorWaitProp", 'E') | |||
.args(TENSOR_PROPS); | |||
}); | |||
HANDLE_EVENT(TensorNotifyPropEvent, { | |||
NEW_HOST(ssprintf("%d", pid), 's') | |||
.id(event.tensor_id) | |||
.cat("TensorProp") | |||
.scope(std::to_string(pid)); | |||
}); | |||
HANDLE_EVENT(ShapeInferEvent, { | |||
if (event.success) { | |||
INC_COUNTER(infer_shape_valid_count, 1); | |||
} else { | |||
INC_COUNTER(infer_shape_invalid_count, 1); | |||
} | |||
}); | |||
HANDLE_EVENT(SampleDeviceEvent, { | |||
NEW_HOST("TopKTensor", 'B'); | |||
}); | |||
HANDLE_EVENT(SampleDeviceFinishEvent, { | |||
std::string device_name = event.device.locator().to_string(); | |||
std::string prop_name = ssprintf("%s_alloc_memory", device_name.c_str()); | |||
NEW_HOST(prop_name, 'C') | |||
.arg(prop_name, event.total_memory - event.free_memory); | |||
auto top_k_tensors = state.top_k_tensor_in_device(event.device, options.at("num_tensor_watch")); | |||
auto& top_k_event = NEW_HOST("TopKTensor", 'E'); | |||
for (size_t i = 0; i < top_k_tensors.size(); ++i) { | |||
auto tensor_id = top_k_tensors[i]; | |||
auto& tensor_state = state.tensors[tensor_id]; | |||
top_k_event.arg(ssprintf("top%03d", (int)i), TENSOR_INFO); //%03d is always enough | |||
} | |||
}); | |||
HANDLE_EVENT(WorkerExceptionEvent, { | |||
INC_COUNTER(exception_count, 1); | |||
}); | |||
HANDLE_EVENT(TensorCommandEvent, { | |||
NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'B'); | |||
}); | |||
HANDLE_EVENT(TensorCommandFinishEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id]; | |||
NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'E') | |||
.args(TENSOR_PROPS); | |||
}); | |||
HANDLE_EVENT(ScopeEvent, { | |||
NEW_HOST(event.name, 'B'); | |||
state.threads[tid].scope_stack.push_back(event.name); | |||
}); | |||
HANDLE_EVENT(ScopeFinishEvent, { | |||
NEW_HOST(event.name, 'E'); | |||
mgb_assert(state.threads[tid].scope_stack.back() == event.name); | |||
state.threads[tid].scope_stack.pop_back(); | |||
}); | |||
HANDLE_TENSOR_EVENT(OpInputEvent, OpInputFinishEvent, ssprintf("Input %zu", event.tensor_id)); | |||
HANDLE_TENSOR_EVENT(OpOutputEvent, OpOutputFinishEvent, ssprintf("Output %zu", event.tensor_id)); | |||
HANDLE_TENSOR_EVENT(OpDelEvent, OpDelFinishEvent, ssprintf("Del %zu", event.tensor_id)); | |||
HANDLE_PLAIN_EVENT(StartProfileEvent, StartProfileFinishEvent, "StartProfile"); | |||
HANDLE_PLAIN_EVENT(StopProfileEvent, StopProfileFinishEvent, "StopProfile"); | |||
HANDLE_PLAIN_EVENT(CustomEvent, CustomFinishEvent, event.title); | |||
HANDLE_PLAIN_EVENT(AutoEvictEvent, AutoEvictFinishEvent, "AutoEvict"); | |||
if (results.size() > 0) { | |||
uint64_t time = results[0].second.time; | |||
trace_events.new_event().name("Metadata").ph('I').pid(pid).ts(0).arg("localTime", time/1e3); | |||
} | |||
for (auto&& result: results) { | |||
collector(result.second.id, result.first, result.second.time, result.second.data); | |||
} | |||
for (auto&& [tid, thread]: state.threads) { | |||
if (!thread_dict.count(tid)) { | |||
continue; | |||
template <typename TEvent> | |||
void visit_event(const TEvent &event) { | |||
if constexpr (std::is_same_v<TEvent, OpDispatchEvent>) { | |||
new_host_event("OpDispatch", 'B'); | |||
new_host_event(pid_str, 's') | |||
.cat("OpDispatch") | |||
.id(event.op_id) | |||
.scope(pid_str); | |||
new_host_event("OpDispatch", 'E').args(current_op->detail()); | |||
} else if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) { | |||
mgb_assert(event.op_id != 0); | |||
current_op->execute_begin = current->time; | |||
new_host_event(current_op->name, 'B'); | |||
new_host_event(pid_str, 't') | |||
.cat("OpDispatch") | |||
.id(current_op->id) | |||
.scope(pid_str); | |||
} else if constexpr (std::is_same_v<TEvent, OpExecuteFinishEvent>) { | |||
current_op->execute_end = current->time; | |||
new_host_event(current_op->name, 'E') | |||
.args(current_op->detail()); | |||
} else if constexpr (std::is_same_v<TEvent, KernelLaunchEvent>) { | |||
new_host_event(pid_str, 's') | |||
.id(event.kernel_id) | |||
.cat("KernelLaunch") | |||
.scope(pid_str); | |||
new_device_event(current_op->name, 'B', event.device) | |||
.cat("Kernel"); | |||
new_device_event(pid_str, 'f', event.device) | |||
.id(event.kernel_id) | |||
.bp('e') | |||
.cat("KernelLaunch") | |||
.scope(pid_str); | |||
} else if constexpr (std::is_same_v<TEvent, KernelLaunchFinishEvent>) { | |||
new_device_event(current_op->name, 'E', event.device) | |||
.cat("Kernel") | |||
.args(current_op->detail()); | |||
} else if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) { | |||
if (current_tensor->living_time == profiler::Duration::zero()) { | |||
new_host_event(pid_str, 's') | |||
.id(event.tensor_id) | |||
.cat("TensorLink") | |||
.scope(pid_str); | |||
} else { | |||
new_host_event(pid_str, 't') | |||
.id(event.tensor_id) | |||
.cat("TensorLink") | |||
.scope(pid_str); | |||
} | |||
} else if constexpr (std::is_same_v<TEvent, TensorUsageEvent>) { | |||
new_host_event(pid_str, 't') | |||
.id(event.tensor_id) | |||
.cat("TensorLink") | |||
.scope(pid_str); | |||
} else if constexpr (std::is_same_v<TEvent, TensorReleaseEvent>) { | |||
current_tensor->living_time += current->time - current_tensor->produced; | |||
current_tensor->produced = {}; | |||
new_host_event(pid_str, 't') | |||
.id(event.tensor_id) | |||
.cat("TensorLink") | |||
.scope(pid_str); | |||
} else if constexpr (std::is_same_v<TEvent, TensorEraseEvent>) { | |||
if (current_tensor->living_time != profiler::Duration::zero()) { | |||
new_host_event(pid_str, 'f') | |||
.id(event.tensor_id) | |||
.bp('e') | |||
.cat("TensorLink") | |||
.scope(pid_str); | |||
} | |||
} else if constexpr (std::is_same_v<TEvent, TensorGetPropEvent>) { | |||
new_host_event("TensorGetProp", 'X') | |||
.dur(0).args(current_tensor->detail(current->time)); | |||
} else if constexpr (std::is_same_v<TEvent, TensorWaitPropFinishEvent>) { | |||
new_host_event(pid_str, 'f') | |||
.id(event.tensor_id) | |||
.bp('e') | |||
.cat("TensorProp") | |||
.scope(pid_str); | |||
new_host_event("TensorWaitProp", 'E') | |||
.args(current_tensor->detail(current->time)); | |||
} else if constexpr (std::is_same_v<TEvent, TensorNotifyPropEvent>) { | |||
new_host_event(pid_str, 's') | |||
.id(event.tensor_id) | |||
.cat("TensorProp") | |||
.scope(pid_str); | |||
} else if constexpr (std::is_same_v<TEvent, SampleDeviceFinishEvent>) { | |||
std::string device_name = event.device.locator().to_string(); | |||
new_host_event("memory", 'C') | |||
.arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory); | |||
} else if constexpr (std::is_same_v<TEvent, TensorCommandEvent>) { | |||
new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B'); | |||
} else if constexpr (std::is_same_v<TEvent, TensorCommandFinishEvent>) { | |||
new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'E') | |||
.args(current_tensor->detail(current->time)); | |||
} else if constexpr (std::is_same_v<TEvent, ScopeEvent>) { | |||
new_host_event(event.name, 'B'); | |||
} else if constexpr (std::is_same_v<TEvent, ScopeFinishEvent>) { | |||
new_host_event(event.name, 'E'); | |||
} else if constexpr (std::is_same_v<TEvent, OpInputEvent>) { | |||
new_host_event(ssprintf("Input %zu", event.tensor_id), 'B') | |||
.args(current_tensor->detail(current->time)); | |||
} else if constexpr (std::is_same_v<TEvent, OpInputFinishEvent>) { | |||
new_host_event(ssprintf("Input %zu", event.tensor_id), 'E') | |||
.args(current_tensor->detail(current->time)); | |||
} else if constexpr (std::is_same_v<TEvent, OpOutputEvent>) { | |||
new_host_event(ssprintf("Output %zu", event.tensor_id), 'B') | |||
.args(current_tensor->detail(current->time)); | |||
} else if constexpr (std::is_same_v<TEvent, OpOutputFinishEvent>) { | |||
new_host_event(ssprintf("Output %zu", event.tensor_id), 'E') | |||
.args(current_tensor->detail(current->time)); | |||
} else if constexpr (std::is_same_v<TEvent, StartProfileEvent>) { | |||
new_host_event("StartProfile", 'B'); | |||
} else if constexpr (std::is_same_v<TEvent, StartProfileFinishEvent>) { | |||
new_host_event("StartProfile", 'E'); | |||
} else if constexpr (std::is_same_v<TEvent, StopProfileEvent>) { | |||
new_host_event("StopProfile", 'B'); | |||
} else if constexpr (std::is_same_v<TEvent, StopProfileFinishEvent>) { | |||
new_host_event("StopProfile", 'E'); | |||
} else if constexpr (std::is_same_v<TEvent, CustomEvent>) { | |||
new_host_event(event.title, 'B'); | |||
} else if constexpr (std::is_same_v<TEvent, CustomFinishEvent>) { | |||
new_host_event(event.title, 'E'); | |||
} else if constexpr (std::is_same_v<TEvent, AutoEvictEvent>) { | |||
new_host_event("AutoEvict", 'B'); | |||
} else if constexpr (std::is_same_v<TEvent, AutoEvictFinishEvent>) { | |||
new_host_event("AutoEvict", 'E'); | |||
} | |||
trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(thread.index).arg("name", thread_dict[tid]); | |||
} | |||
for (auto&& [device, device_state]: state.devices) { | |||
trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(256+device_state.index).arg("name", device.to_string()); | |||
void notify_counter(std::string key, int64_t old_val, int64_t new_val) { | |||
new_host_event(key, 'C').arg("value", new_val); | |||
} | |||
}; | |||
trace_events.to_json()->writeto_fpath(filename); | |||
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result){ | |||
ChromeTimelineEventVisitor visitor; | |||
visitor.process_events(result); | |||
visitor.trace_events.metadata("localTime") = std::to_string(result.start_at.time_since_epoch().count()); | |||
std::string json_repr = visitor.trace_events.to_string(); | |||
mgb::debug::write_to_file(filename.c_str(), json_repr); | |||
} | |||
} |
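The visitor above ultimately serializes into Chrome's trace-event JSON, where 'B'/'E' open and close a duration span, 'C' plots a counter, 'M' attaches metadata such as thread names, and 's' starts a flow arrow. A minimal illustrative sketch of that record shape (not the actual trace_events API; emit_event is a made-up helper):

#include <cstdio>
#include <string>

// Emit one Chrome-trace event record; ph is 'B' (begin), 'E' (end),
// 'C' (counter), 'M' (metadata) or 's' (flow start); ts is in microseconds.
void emit_event(const std::string& name, char ph,
                int pid, int tid, double ts_us) {
    std::printf("{\"name\":\"%s\",\"ph\":\"%c\",\"pid\":%d,\"tid\":%d,\"ts\":%.3f},\n",
                name.c_str(), ph, pid, tid, ts_us);
}

int main() {
    emit_event("StartProfile", 'B', 1, 0, 12.0);  // opens a duration span
    emit_event("StartProfile", 'E', 1, 0, 48.5);  // closes the same span
}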
@@ -12,7 +12,9 @@ | |||
#pragma once | |||
#include "megbrain/utils/small_vector.h" | |||
#include "megbrain/imperative/profiler.h" | |||
#include "../interpreter/stack_manager.h" | |||
#include "../op_trait.h" | |||
namespace mgb::imperative::profiler { | |||
@@ -52,6 +54,11 @@ struct ToStringTrait<profiler::TensorProp>{ | |||
namespace mgb::imperative::profiler { | |||
using Trace = interpreter::intl::StackManager::Trace; | |||
struct ProfileOperatorState; | |||
struct ProfileTensorState; | |||
#define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__; | |||
#define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__; | |||
@@ -61,6 +68,7 @@ DEF_EVENT(OpDispatch, { | |||
std::function<OpParams()> op_params; | |||
SmallVector<uint64_t> inputs; | |||
SmallVector<uint64_t> outputs; | |||
Trace trace; | |||
}); | |||
DEF_DUR_EVENT(OpInput, { | |||
@@ -68,11 +76,6 @@ DEF_DUR_EVENT(OpInput, { | |||
TensorShape shape; | |||
}); | |||
DEF_DUR_EVENT(OpDel, { | |||
uint64_t tensor_id; | |||
TensorShape shape; | |||
}); | |||
DEF_DUR_EVENT(OpOutput, { | |||
uint64_t tensor_id; | |||
TensorShape shape; | |||
@@ -80,16 +83,13 @@ DEF_DUR_EVENT(OpOutput, { | |||
DEF_DUR_EVENT(OpExecute, { | |||
uint64_t op_id; | |||
SmallVector<CompNode> device_list; | |||
}); | |||
DEF_DUR_EVENT(OpPostExecute, { | |||
uint64_t op_id; | |||
}); | |||
DEF_DUR_EVENT(KernelExecute, { | |||
DEF_DUR_EVENT(KernelLaunch, { | |||
uint64_t op_id; | |||
uint64_t kernel_id; | |||
std::shared_ptr<CompNode::Event> event; | |||
CompNode device; | |||
}); | |||
DEF_EVENT(TensorDeclare, { | |||
@@ -128,19 +128,12 @@ DEF_EVENT(TensorNotifyProp, { | |||
TensorProp prop; | |||
}); | |||
DEF_EVENT(TensorWaitProp, { | |||
DEF_DUR_EVENT(TensorWaitProp, { | |||
uint64_t tensor_id; | |||
uint64_t wait_id; | |||
TensorProp prop; | |||
}); | |||
DEF_EVENT(TensorWaitPropFinish, { | |||
uint64_t tensor_id; | |||
uint64_t wait_id; | |||
TensorProp prop; | |||
bool notified; | |||
}); | |||
DEF_DUR_EVENT(SampleDevice, { | |||
CompNode device; | |||
size_t total_memory; | |||
@@ -157,13 +150,10 @@ DEF_DUR_EVENT(Scope, { | |||
std::string name; | |||
}); | |||
DEF_DUR_EVENT(DeviceScope, { | |||
std::string name; | |||
std::shared_ptr<CompNode::Event> event; | |||
DEF_DUR_EVENT(Sync, { | |||
Trace trace; | |||
}); | |||
DEF_DUR_EVENT(Sync, {}); | |||
DEF_DUR_EVENT(StartProfile, { | |||
size_t capture_count; | |||
}); | |||
@@ -172,10 +162,13 @@ DEF_DUR_EVENT(StopProfile, { | |||
size_t escape_count; | |||
}); | |||
enum class TensorCommandKind { | |||
Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue | |||
}; | |||
DEF_DUR_EVENT(TensorCommand, { | |||
enum Kind { | |||
Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue | |||
}; | |||
using Kind = TensorCommandKind; | |||
uint64_t tensor_id; | |||
Kind kind; | |||
}); | |||
@@ -187,6 +180,17 @@ DEF_DUR_EVENT(Custom, { | |||
std::string content; | |||
}); | |||
DEF_EVENT(RecordDevice, { | |||
std::shared_ptr<CompNode::Event> event; | |||
}); | |||
DEF_DUR_EVENT(HostToDevice, { | |||
TensorLayout layout; | |||
CompNode device; | |||
void* host_ptr; | |||
void* device_ptr; | |||
}); | |||
#undef DEF_EVENT | |||
#undef DEF_DUR_EVENT | |||
@@ -15,10 +15,12 @@ | |||
#include "megbrain/imperative/profiler.h" | |||
#include "./states.h" | |||
namespace mgb::imperative::profiler { | |||
void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results); | |||
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result); | |||
void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results); | |||
void dump_memory_flow(std::string filename, Profiler::bundle_t result); | |||
} |
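For readers unfamiliar with the event macros above: DEF_EVENT declares a single point event, while DEF_DUR_EVENT declares a begin/finish pair sharing one field list. A hand-expanded sketch of what one DEF_DUR_EVENT line produces:

#include <string>

// Hand expansion of: DEF_DUR_EVENT(Scope, { std::string name; });
// The two structs mark the 'B' and 'E' edges of one duration span.
struct ScopeEvent { std::string name; };
struct ScopeFinishEvent { std::string name; };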
@@ -89,7 +89,8 @@ struct MemoryChunk { | |||
std::array<uintptr_t, 2> address; | |||
std::string name; | |||
TensorLayout layout; | |||
std::array<uint64_t, 2> time; | |||
std::array<profiler::Duration, 2> time; | |||
std::optional<uint64_t> group; | |||
bool empty() const { | |||
return address[1] - address[0] == 0; | |||
@@ -111,9 +112,9 @@ struct MemoryFlow { | |||
return {addr_begin, addr_end}; | |||
} | |||
std::pair<uint64_t, uint64_t> time_range() const { | |||
auto time_begin = std::numeric_limits<uint64_t>::max(); | |||
auto time_end = std::numeric_limits<uint64_t>::min(); | |||
std::pair<profiler::Duration, profiler::Duration> time_range() const { | |||
auto time_begin = profiler::Duration::max(); | |||
auto time_end = profiler::Duration::min(); | |||
for(auto&& [id, chunk]: chunks) { | |||
MGB_MARK_USED_VAR(id); | |||
if (chunk.empty()) continue; | |||
@@ -123,27 +124,6 @@ struct MemoryFlow { | |||
return {time_begin, time_end}; | |||
} | |||
std::shared_ptr<json::Array> to_json() const { | |||
auto results = json::Array::make(); | |||
for(auto&& [id, chunk]: chunks) { | |||
MGB_MARK_USED_VAR(id); | |||
if (chunk.empty()) continue; | |||
auto address = json::Array::make(); | |||
auto time = json::Array::make(); | |||
address->add(json::String::make(std::to_string(chunk.address[0]))); | |||
address->add(json::String::make(std::to_string(chunk.address[1]))); | |||
time->add(json::String::make(std::to_string(chunk.time[0]))); | |||
time->add(json::String::make(std::to_string(chunk.time[1]))); | |||
results->add(json::Object::make({ | |||
{"address", address}, | |||
{"name", json::String::make(chunk.name)}, | |||
{"layout", json::String::make(chunk.layout.to_string())}, | |||
{"time", time} | |||
})); | |||
} | |||
return results; | |||
} | |||
XMLWriter to_svg() const { | |||
XMLWriter writer; | |||
auto&& [addr_begin, addr_end] = address_range(); | |||
@@ -157,13 +137,13 @@ struct MemoryFlow { | |||
svg.attr("xmlns:tag", std::string{"https://megengine.org.cn"}); | |||
double time_scale = 1e5; | |||
double addr_scale = 1e6; | |||
svg.attr("width", (time_end-time_begin)/time_scale); | |||
svg.attr("width", (time_end-time_begin).count()/time_scale); | |||
svg.attr("height", (addr_end-addr_begin)/addr_scale); | |||
{ | |||
auto rect = writer.element("rect"); | |||
rect.attr("x", 0); | |||
rect.attr("y", 0); | |||
rect.attr("width", (time_end-time_begin)/time_scale); | |||
rect.attr("width", (time_end-time_begin).count()/time_scale); | |||
rect.attr("height", (addr_end-addr_begin)/addr_scale); | |||
rect.attr("fill", std::string{"blue"}); | |||
} | |||
@@ -177,7 +157,7 @@ struct MemoryFlow { | |||
{1000 * ms, "#888888"}, | |||
{std::numeric_limits<double>::infinity(), "#555555"}, | |||
}; | |||
auto time2str = [](uint64_t ns){ | |||
auto time2str = [](profiler::Duration ns){ | |||
using pair_t = std::pair<uint64_t, const char*>; | |||
static pair_t units[] = { | |||
{1, "ns "}, | |||
@@ -189,9 +169,9 @@ struct MemoryFlow { | |||
auto comparator = [](const pair_t& lhs, const pair_t& rhs) { | |||
return lhs.first < rhs.first; | |||
}; | |||
while (ns > 0) { | |||
auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns, ""), comparator) - 1; | |||
builder += std::to_string(ns / iter->first) + iter->second; | |||
while (ns.count() > 0) { | |||
auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns.count(), ""), comparator) - 1; | |||
builder += std::to_string(ns.count() / iter->first) + iter->second; | |||
ns = ns % iter->first; | |||
} | |||
return builder; | |||
@@ -218,11 +198,11 @@ struct MemoryFlow { | |||
for (auto&& [id, chunk]: chunks) { | |||
MGB_MARK_USED_VAR(id); | |||
if (chunk.empty()) continue; | |||
double left = (chunk.time[0]-time_begin)/time_scale; | |||
double right = (chunk.time[1]-time_begin)/time_scale; | |||
double left = (chunk.time[0]-time_begin).count()/time_scale; | |||
double right = (chunk.time[1]-time_begin).count()/time_scale; | |||
double top = (chunk.address[0]-addr_begin)/addr_scale; | |||
double bottom = (chunk.address[1]-addr_begin)/addr_scale; | |||
double duration = chunk.time[1] - chunk.time[0]; | |||
double duration = (chunk.time[1] - chunk.time[0]).count(); | |||
{ | |||
auto rect = writer.element("rect"); | |||
rect.attr("x", left); | |||
@@ -241,70 +221,48 @@ struct MemoryFlow { | |||
mge_attr("produced", time2str(chunk.time[0])); | |||
mge_attr("erased", time2str(chunk.time[1])); | |||
mge_attr("duration", time2str(chunk.time[1] - chunk.time[0])); | |||
if (chunk.group) { | |||
mge_attr("group", std::to_string(*chunk.group)); | |||
} | |||
} | |||
} | |||
return writer; | |||
} | |||
}; | |||
void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results) { | |||
MemoryFlow flow; | |||
ProfileDataCollector collector; | |||
ProfileState state; | |||
#define HANDLE_EVENT(type, ...) \ | |||
collector.handle<type>([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ ); | |||
HANDLE_EVENT(TensorDeclareEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id] = {}; | |||
tensor_state.id = event.tensor_id; | |||
tensor_state.name = event.name; | |||
}); | |||
struct MemoryFlowVisitor: EventVisitor<MemoryFlowVisitor> { | |||
MemoryFlow memory_flow; | |||
HANDLE_EVENT(TensorProduceEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id]; | |||
tensor_state.device = event.device; | |||
tensor_state.layout = event.layout; | |||
tensor_state.produced = time; | |||
state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()}); | |||
state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced}); | |||
auto& chunk = flow.chunks[event.tensor_id]; | |||
uintptr_t address = reinterpret_cast<uintptr_t>(event.ptr); | |||
auto span = event.layout.span(); | |||
auto dtype = event.layout.dtype; | |||
// assume dtype is not lowbit | |||
if (!address) { | |||
chunk.address = {0, 0}; | |||
} else { | |||
chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()}; | |||
template <typename TEvent> | |||
void visit_event(const TEvent &event) { | |||
if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) { | |||
auto& chunk = memory_flow.chunks[event.tensor_id]; | |||
uint64_t address = reinterpret_cast<uintptr_t>(event.ptr); | |||
auto span = event.layout.span(); | |||
auto dtype = event.layout.dtype; | |||
// assume dtype is not lowbit | |||
if (!address) { | |||
chunk.address = {0, 0}; | |||
} else { | |||
chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()}; | |||
} | |||
chunk.layout = event.layout; | |||
chunk.time[0] = since_start(to_device_time(current->time, current_tensor->device)); | |||
chunk.name = current_tensor->name; | |||
chunk.group = current_tensor->source; | |||
} else if constexpr (std::is_same_v<TEvent, TensorReleaseEvent>) { | |||
auto& chunk = memory_flow.chunks[event.tensor_id]; | |||
chunk.time[1] = since_start(to_device_time(current->time, current_tensor->device)); | |||
} | |||
chunk.layout = tensor_state.layout; | |||
chunk.time[0] = time; | |||
chunk.name = tensor_state.name; | |||
}); | |||
HANDLE_EVENT(TensorReleaseEvent, { | |||
auto& tensor_state = state.tensors[event.tensor_id]; | |||
state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()}); | |||
state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced}); | |||
auto& chunk = flow.chunks[event.tensor_id]; | |||
chunk.time[1] = time; | |||
}); | |||
HANDLE_EVENT(ScopeEvent, { | |||
state.threads[tid].scope_stack.push_back(event.name); | |||
}); | |||
HANDLE_EVENT(ScopeFinishEvent, { | |||
mgb_assert(state.threads[tid].scope_stack.back() == event.name); | |||
state.threads[tid].scope_stack.pop_back(); | |||
}); | |||
for (auto&& result: results) { | |||
collector(result.second.id, result.first, result.second.time, result.second.data); | |||
} | |||
debug::write_to_file(filename.c_str(), flow.to_svg().to_string()); | |||
void notify_counter(std::string key, int64_t old_val, int64_t new_val) {} | |||
}; | |||
void dump_memory_flow(std::string filename, Profiler::bundle_t result) { | |||
MemoryFlowVisitor visitor; | |||
visitor.process_events(std::move(result)); | |||
debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string()); | |||
} | |||
} |
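The time2str helper above greedily decomposes a nanosecond count into human-readable units: take the largest unit not exceeding the remainder, append "<count><unit>", and repeat. A standalone sketch of the same loop over plain integers, with the unit table truncated to the entries visible in the diff:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <string>
#include <utility>

std::string time2str_sketch(uint64_t ns) {
    using pair_t = std::pair<uint64_t, const char*>;
    static const pair_t units[] = {
        {1, "ns "}, {1000, "us "}, {1000000, "ms "}, {1000000000, "s "},
    };
    std::string builder;
    while (ns > 0) {
        // largest unit whose size is <= ns
        auto iter = std::upper_bound(std::begin(units), std::end(units),
                                     pair_t{ns, ""},
                                     [](const pair_t& lhs, const pair_t& rhs) {
                                         return lhs.first < rhs.first;
                                     }) - 1;
        builder += std::to_string(ns / iter->first) + iter->second;
        ns %= iter->first;
    }
    return builder;
}

int main() {
    std::cout << time2str_sketch(1234567) << '\n';  // prints "1ms 234us 567ns "
}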
@@ -3,6 +3,9 @@ | |||
#include <set> | |||
#include <any> | |||
#include <typeindex> | |||
#include <sstream> | |||
#include "nlohmann/json.hpp" | |||
#include "megbrain/tensor.h" | |||
@@ -10,24 +13,16 @@ | |||
namespace mgb::imperative::profiler { | |||
struct ProfileDeviceState { | |||
int64_t index; | |||
CompNode device; | |||
std::shared_ptr<CompNode::Event> base_event; | |||
uint64_t base_time; //in ns | |||
}; | |||
struct ProfileWorkerState { | |||
}; | |||
using StackManager = interpreter::intl::StackManager; | |||
struct ProfileTensorState { | |||
uint64_t id; | |||
uint64_t id = 0; | |||
std::optional<uint64_t> source; | |||
TensorLayout layout; | |||
CompNode device; | |||
std::string name; | |||
uint64_t produced = 0; | |||
uint64_t living_time = 0; | |||
profiler::HostTime produced = profiler::HostTime::min(); | |||
profiler::Duration living_time = profiler::Duration::zero(); | |||
size_t size_in_bytes() const { | |||
if (!layout.dtype.valid()) { | |||
@@ -35,41 +30,51 @@ struct ProfileTensorState { | |||
} | |||
return layout.dtype.size(layout.total_nr_elems()); | |||
} | |||
}; | |||
struct ProfileStaticsState { | |||
size_t op_enqueue_count = 0; | |||
size_t op_execute_count = 0; | |||
size_t wait_value_count = 0; | |||
size_t wait_shape_count = 0; | |||
size_t exception_count = 0; | |||
size_t infer_shape_valid_count = 0; | |||
size_t infer_shape_invalid_count = 0; | |||
size_t alive_tensor_count = 0; | |||
size_t produce_tensor_count = 0; | |||
size_t erase_tensor_count = 0; | |||
size_t wait_prop_count = 0; | |||
size_t redundant_tensor_count = 0; | |||
std::string info(HostTime current_time) { | |||
std::string shape = layout.TensorShape::to_string(); | |||
std::string dtype = layout.dtype.name(); | |||
return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.to_string().c_str()); | |||
} | |||
nlohmann::json detail(HostTime current_time) { | |||
nlohmann::json args; | |||
args["id"] = id; | |||
args["name"] = name; | |||
args["shape"] = layout.TensorShape::to_string(); | |||
args["dtype"] = layout.dtype.name(); | |||
args["nr_elements"] = layout.total_nr_elems(); | |||
args["device"] = device.to_string(); | |||
if (produced != produced.min()) { | |||
double ms_count = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(current_time - produced + living_time).count(); | |||
args["living_time"] = ssprintf("%lf ms", ms_count); | |||
} | |||
return args; | |||
} | |||
}; | |||
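detail() leans on nlohmann::json's map-like operator[]; a minimal standalone sketch with made-up field values (nlohmann::json serializes keys in sorted order by default):

#include <iostream>
#include "nlohmann/json.hpp"

int main() {
    nlohmann::json args;
    args["id"] = 42;                    // tensor id
    args["name"] = "conv2d:out";        // tensor name (invented)
    args["shape"] = "{16,64,56,56}";    // TensorShape::to_string()-style
    args["device"] = "gpu0:0";
    std::cout << args.dump() << '\n';
    // {"device":"gpu0:0","id":42,"name":"conv2d:out","shape":"{16,64,56,56}"}
}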
struct ProfileOperatorState { | |||
uint64_t id; | |||
uint64_t id = 0; | |||
std::string name; | |||
OpParams params; | |||
SmallVector<uint64_t> inputs; | |||
SmallVector<uint64_t> outputs; | |||
CompNode device; | |||
Trace trace; | |||
uint64_t host_begin; | |||
uint64_t host_end; | |||
std::shared_ptr<CompNode::Event> device_begin; | |||
std::shared_ptr<CompNode::Event> device_end; | |||
}; | |||
profiler::HostTime execute_begin; | |||
profiler::HostTime execute_end; | |||
struct ProfileThreadState { | |||
std::thread::id tid; | |||
int64_t index; | |||
std::vector<std::string> scope_stack; | |||
nlohmann::json detail() { | |||
nlohmann::json args; | |||
for (auto&& [name, value]: params) { | |||
args[name] = value; | |||
} | |||
args["__id__"] = id; | |||
args["__name__"] = name; | |||
args["__device__"] = device.to_string(); | |||
return args; | |||
} | |||
}; | |||
template <typename TProp> | |||
@@ -93,37 +98,12 @@ struct ProfileTensorPropPair { | |||
using ProfileTensorSizePair = ProfileTensorPropPair<size_t>; | |||
using ProfileTensorProducedPair = ProfileTensorPropPair<uint64_t>; | |||
struct GeneralTensorEvent { | |||
uint64_t tensor_id; | |||
std::type_index type; | |||
}; | |||
struct ProfileState { | |||
std::unordered_map<uint64_t, ProfileTensorState> tensors; | |||
std::unordered_map<uint64_t, ProfileOperatorState> operators; | |||
std::unordered_map<std::string, uint64_t> tensor_name_counter; | |||
std::set<ProfileTensorSizePair> tensors_by_size; | |||
std::set<ProfileTensorSizePair> tensors_by_produced; | |||
ProfileWorkerState worker; | |||
ProfileStaticsState statics; | |||
std::unordered_map<std::thread::id, ProfileThreadState> threads; | |||
CompNode::UnorderedMap<ProfileDeviceState> devices; | |||
ProfileThreadState& operator[](std::thread::id tid) { | |||
if (threads.count(tid) == 0) { | |||
threads[tid].tid = tid; | |||
threads[tid].index = threads.size(); | |||
} | |||
return threads[tid]; | |||
} | |||
ProfileDeviceState& operator[](CompNode device) { | |||
if (devices.count(device) == 0) { | |||
devices[device].device = device; | |||
devices[device].index = devices.size(); | |||
} | |||
return devices[device]; | |||
} | |||
std::vector<uint64_t> top_k_tensor_in_device(CompNode device, size_t k) { | |||
std::vector<uint64_t> results; | |||
@@ -138,19 +118,233 @@ struct ProfileState { | |||
} | |||
return results; | |||
} | |||
}; | |||
std::string concat_scope(std::thread::id tid) { | |||
auto& scope_stack = threads[tid].scope_stack; | |||
if (scope_stack.empty()) { | |||
return {}; | |||
} | |||
std::string result = scope_stack[0]; | |||
for (size_t i = 1; i < scope_stack.size(); ++i) { | |||
result += "::"; | |||
result += scope_stack[i]; | |||
template<typename T, typename = void> | |||
struct is_op_event : std::false_type { }; | |||
template<typename T> | |||
struct is_op_event<T, decltype(std::declval<T>().op_id, void())> : std::true_type { }; | |||
template<typename T, typename = void> | |||
struct is_tensor_event : std::false_type { }; | |||
template<typename T> | |||
struct is_tensor_event<T, decltype(std::declval<T>().tensor_id, void())> : std::true_type { }; | |||
template<typename T, typename = void> | |||
struct is_trace_event : std::false_type { }; | |||
template<typename T> | |||
struct is_trace_event<T, decltype(std::declval<T>().trace, void())> : std::true_type { }; | |||
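These detection traits use the classic void_t-style member-detection idiom: the partial specialization wins overload resolution only when the member expression is well-formed. A self-contained sketch with hypothetical event types:

#include <cstdint>
#include <type_traits>
#include <utility>

struct WithOpId { uint64_t op_id; };
struct Plain {};

template <typename T, typename = void>
struct has_op_id : std::false_type {};
// Picked only when `std::declval<T>().op_id` compiles.
template <typename T>
struct has_op_id<T, decltype(std::declval<T>().op_id, void())> : std::true_type {};

static_assert(has_op_id<WithOpId>::value, "WithOpId exposes op_id");
static_assert(!has_op_id<Plain>::value, "Plain does not");

int main() {}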
template <typename... TItems> | |||
class AnyToVariantConverter { | |||
public: | |||
using any_t = std::any; | |||
using variant_t = std::variant<TItems...>; | |||
private: | |||
std::unordered_map<std::type_index, std::function<variant_t(any_t)>> m_table; | |||
template <typename TItem> | |||
void register_converter() { | |||
m_table[typeid(TItem)] = [](any_t input) { | |||
return variant_t(std::any_cast<TItem>(std::move(input))); | |||
}; | |||
} | |||
public: | |||
AnyToVariantConverter() { | |||
(register_converter<TItems>(), ...); | |||
} | |||
variant_t operator()(any_t input) { | |||
return m_table[input.type()](std::move(input)); | |||
} | |||
}; | |||
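AnyToVariantConverter builds a type_index-keyed dispatch table so a type-erased std::any can be lowered back into a closed std::variant with a single hash lookup. A compact restatement with a usage example (names reused for illustration only; at() is used here so an unregistered type throws instead of inserting):

#include <any>
#include <functional>
#include <iostream>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <variant>

template <typename... TItems>
class AnyToVariantConverter {
    using variant_t = std::variant<TItems...>;
    std::unordered_map<std::type_index, std::function<variant_t(std::any)>> m_table;
public:
    AnyToVariantConverter() {
        // register one any -> variant caster per alternative
        ((m_table[typeid(TItems)] = [](std::any in) {
            return variant_t(std::any_cast<TItems>(std::move(in)));
        }), ...);
    }
    variant_t operator()(std::any input) {
        return m_table.at(input.type())(std::move(input));
    }
};

int main() {
    AnyToVariantConverter<int, std::string> conv;
    auto v = conv(std::any{std::string{"Put"}});
    // std::visit hands back the statically-typed alternative.
    std::visit([](auto&& x) { std::cout << x << '\n'; }, v);  // prints "Put"
}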
template <typename TSelf> | |||
class EventVisitor { | |||
private: | |||
std::unordered_map<size_t, ProfileOperatorState> m_operators; | |||
std::unordered_map<size_t, ProfileTensorState> m_tensors; | |||
std::unordered_map<size_t, std::vector<Profiler::Record>> m_duration_stack; | |||
HostTime m_start_time; | |||
CompNode::UnorderedMap<size_t> m_device_tid_table; | |||
std::unordered_map<std::thread::id, size_t> m_host_tid_table; | |||
CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> m_device_timeline; | |||
std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack; | |||
std::unordered_map<std::string, int64_t> m_counter_table; | |||
protected: | |||
Profiler::Record* current; | |||
ProfileOperatorState* current_op; | |||
ProfileTensorState* current_tensor; | |||
protected: | |||
profiler::Duration since_start(profiler::HostTime time) { | |||
return time - m_start_time; | |||
} | |||
profiler::HostTime to_device_time(profiler::HostTime time, CompNode device) { | |||
auto& device_timeline = m_device_timeline[device]; | |||
auto upper = device_timeline.lower_bound(time); | |||
if (upper == device_timeline.end()) { | |||
if (upper == device_timeline.begin()) { | |||
return time; | |||
} else { | |||
--upper; | |||
return time + std::chrono::duration_cast<profiler::Duration>(upper->second); | |||
} | |||
} else if (upper->first == time) { | |||
return time + std::chrono::duration_cast<profiler::Duration>(upper->second); | |||
} else if (upper == device_timeline.begin()) { | |||
return time + std::chrono::duration_cast<profiler::Duration>(upper->second); | |||
} | |||
auto lower = upper; | |||
-- lower; | |||
double ratio = ((double)(time - lower->first).count() / (double)(upper->first - lower->first).count()); | |||
mgb_assert(ratio > 0 && ratio < 1, "invalid ratio"); | |||
mgb_assert(lower->first + lower->second <= upper->first + upper->second, "device time corr"); | |||
auto shift = lower->second + ratio * (upper->second - lower->second); | |||
auto result = time + std::chrono::duration_cast<profiler::Duration>(shift); | |||
return result; | |||
} | |||
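to_device_time linearly interpolates the host-to-device clock shift between the two calibration samples bracketing the query time. A numeric toy example of that interpolation (all values invented):

#include <iostream>

int main() {
    // calibration: at host t=100us shift=+3us, at host t=200us shift=+5us
    double lower_host = 100, lower_shift = 3;
    double upper_host = 200, upper_shift = 5;
    double t = 150;  // host timestamp to convert
    double ratio = (t - lower_host) / (upper_host - lower_host);       // 0.5
    double shift = lower_shift + ratio * (upper_shift - lower_shift);  // 4
    std::cout << "device time = " << t + shift << "us\n";              // 154us
}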
size_t to_tid(std::thread::id host_tid) { | |||
return m_host_tid_table.at(host_tid); | |||
} | |||
size_t to_tid(CompNode device) { | |||
return m_device_tid_table.at(device); | |||
} | |||
void inc_counter(const char* key, int64_t delta) { | |||
if (!m_counter_table.count(key)) { | |||
m_counter_table[key] = 0; | |||
} | |||
auto& value = m_counter_table[key]; | |||
static_cast<TSelf&>(*this).notify_counter(key, value, value + delta); | |||
value += delta; | |||
} | |||
public: | |||
void process_events(Profiler::bundle_t bundle) { | |||
m_start_time = bundle.start_at; | |||
auto& self = static_cast<TSelf&>(*this); | |||
AnyToVariantConverter<OpDispatchEvent, OpExecuteEvent, OpExecuteFinishEvent, | |||
KernelLaunchEvent, KernelLaunchFinishEvent, | |||
OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent, | |||
TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent, | |||
TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent, | |||
SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, | |||
StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent, | |||
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent, | |||
CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent> converter; | |||
auto for_each_entry = [&](auto&& handler) { | |||
for (auto& entry: bundle.entries) { | |||
current = &entry; | |||
std::visit(handler, converter(entry.data)); | |||
} | |||
current = nullptr; | |||
}; | |||
// build device timeline | |||
struct DeviceStartPair { | |||
profiler::HostTime host; | |||
std::shared_ptr<CompNode::Event> device; | |||
}; | |||
CompNode::UnorderedMap<DeviceStartPair> device_start_table; | |||
for_each_entry([&](auto&& event){ | |||
using T = std::decay_t<decltype(event)>; | |||
if constexpr (std::is_same_v<T, RecordDeviceEvent>) { | |||
using namespace std::chrono_literals; | |||
DeviceStartPair& device_start = device_start_table[event.event->comp_node()]; | |||
if (!device_start.device) { | |||
device_start = { current->time, event.event }; | |||
} | |||
event.event->host_wait(); | |||
auto device_time = (device_start.host - current->time) + std::chrono::duration_cast<profiler::RealDuration>(device_start.device->elapsed_time_until(*event.event) * 1s); | |||
m_device_timeline[event.event->comp_node()][current->time] = device_time; | |||
} | |||
}); | |||
// register host threads | |||
for_each_entry([&](auto&& event){ | |||
if (!m_host_tid_table.count(current->tid)) { | |||
m_host_tid_table[current->tid] = {m_device_tid_table.size() + m_host_tid_table.size()}; | |||
} | |||
}); | |||
for_each_entry([&](auto&& event){ | |||
using T = std::decay_t<decltype(event)>; | |||
if constexpr (std::is_same_v<T, OpDispatchEvent>) { | |||
auto& op = m_operators[event.op_id]; | |||
mgb_assert(op.id == 0, "duplicate operator id"); | |||
op.id = event.op_id; | |||
op.name = event.op_name; | |||
op.params = event.op_params(); | |||
op.inputs = event.inputs; | |||
op.outputs = event.outputs; | |||
op.trace = event.trace; | |||
for (auto&& output: event.outputs) { | |||
m_tensors.at(output).source = op.id; | |||
} | |||
} else if constexpr (std::is_same_v<T, TensorDeclareEvent>) { | |||
auto& tensor = m_tensors[event.tensor_id]; | |||
mgb_assert(tensor.id == 0, "duplicated tensor id"); | |||
tensor.id = event.tensor_id; | |||
tensor.name = event.name; | |||
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | |||
auto& tensor = m_tensors.at(event.tensor_id); | |||
if (!m_device_tid_table.count(event.device)) { | |||
m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()}; | |||
} | |||
tensor.device = event.device; | |||
} | |||
}); | |||
// replay execution | |||
using namespace std::placeholders; | |||
for_each_entry([&](auto&& event){ | |||
using T = std::decay_t<decltype(event)>; | |||
// update current_op/tensor | |||
if constexpr (is_op_event<T>::value) { | |||
current_op = &m_operators.at(event.op_id); | |||
} else if constexpr (is_tensor_event<T>::value) { | |||
current_tensor = &m_tensors.at(event.tensor_id); | |||
} | |||
if constexpr (std::is_same_v<T, OpExecuteEvent>) { | |||
current_op->execute_begin = current->time; | |||
} else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) { | |||
current_op->execute_end = current->time; | |||
} | |||
// update counters | |||
if constexpr (std::is_same_v<T, OpDispatchEvent>) { | |||
inc_counter("nr_op_pending", 1); | |||
} else if constexpr (std::is_same_v<T, OpExecuteEvent>) { | |||
inc_counter("nr_op_pending", -1); | |||
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | |||
inc_counter("nr_alive_tensor", 1); | |||
} else if constexpr (std::is_same_v<T, TensorReleaseEvent>) { | |||
inc_counter("nr_alive_tensor", -1); | |||
} else if constexpr (std::is_same_v<T, TensorEraseEvent>) { | |||
if (event.use_count == 0) { | |||
inc_counter("nr_redunant_tensor", 1); | |||
} | |||
} else if constexpr (std::is_same_v<T, ShapeInferEvent>) { | |||
if (!event.success) { | |||
inc_counter("nr_shape_infer_failure", 1); | |||
} | |||
} else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) { | |||
inc_counter("nr_exception", 1); | |||
} | |||
// visit_event_impl | |||
self.visit_event(event); | |||
// reset current_op/tensor | |||
if constexpr (is_op_event<T>::value) { | |||
current_op = nullptr; | |||
} else if constexpr (is_tensor_event<T>::value) { | |||
current_tensor = nullptr; | |||
} | |||
}); | |||
} | |||
}; | |||
} |
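EventVisitor is a CRTP base: process_events owns iteration, state bookkeeping and counters, and statically dispatches each decoded event into the derived class. A minimal sketch of the pattern, with the real event plumbing replaced by ints:

#include <iostream>
#include <vector>

// Base drives the loop; TSelf supplies visit_event. No virtual calls needed.
template <typename TSelf>
struct VisitorBase {
    void process(const std::vector<int>& events) {
        for (int e : events)
            static_cast<TSelf&>(*this).visit_event(e);  // static dispatch
    }
};

struct PrintVisitor : VisitorBase<PrintVisitor> {
    void visit_event(int e) { std::cout << "event " << e << '\n'; }
};

int main() {
    PrintVisitor v;
    v.process({1, 2, 3});
}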
@@ -25,6 +25,9 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { | |||
auto on_seq_start = [this](CompSeqExecBeforeStart const& event) { | |||
// reset | |||
mgb_assert(!event.graph->options().imperative_proxy_graph); | |||
CompNode::foreach([](CompNode device){ | |||
Profiler::record<RecordDeviceEvent>(Timer::record_device(device)); | |||
}); | |||
if (m_opr_dict.empty() && m_var_dict.empty()) { | |||
init_seq(event.exec); | |||
} | |||
@@ -122,11 +125,13 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { | |||
}; | |||
auto on_before_kern = [this](BeforeKernel const& event) { | |||
OperatorNodeBase* opr = event.opr; | |||
Profiler::record<KernelExecuteEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); | |||
Profiler::record<KernelLaunchEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node); | |||
Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node)); | |||
}; | |||
auto on_after_kern = [this](AfterKernel const& event) { | |||
OperatorNodeBase* opr = event.opr; | |||
Profiler::record<KernelExecuteFinishEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); | |||
Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node)); | |||
Profiler::record<KernelLaunchFinishEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node); | |||
}; | |||
auto on_graph_compile = [this](const CompSeqOrderDetermined&) { | |||
m_opr_dict.clear(); | |||
@@ -32,15 +32,22 @@ | |||
namespace mgb { | |||
namespace imperative { | |||
namespace profiler { | |||
using HostTime = std::chrono::time_point<std::chrono::high_resolution_clock>; | |||
using Duration = std::chrono::nanoseconds; | |||
using RealDuration = std::chrono::duration<double, std::nano>; | |||
using Time = HostTime; | |||
} // namespace profiler | |||
class Timer { | |||
public: | |||
void reset(); | |||
uint64_t get_nsecs(); | |||
uint64_t get_started_at(); | |||
static std::shared_ptr<CompNode::Event> record_event(CompNode device); | |||
private: | |||
decltype(std::chrono::steady_clock::now()) m_start; | |||
uint64_t m_started_at; | |||
using Time = profiler::Time; | |||
static profiler::Time record_host(); | |||
static std::shared_ptr<CompNode::Event> record_device(CompNode device); | |||
}; | |||
@@ -48,7 +55,8 @@ class Profiler { | |||
public: | |||
struct Record { | |||
uint64_t id; | |||
uint64_t time; //in ns | |||
std::thread::id tid; | |||
profiler::Time time; | |||
std::any data; | |||
}; | |||
enum Status: uint8_t { | |||
@@ -56,23 +64,32 @@ public: | |||
Recording = 1, | |||
Collecting = 2, | |||
}; | |||
using ProfileCollector = std::function<void(std::thread::id, Record)>; | |||
struct ResultBundle; | |||
using ProfileCollector = std::function<void(Record)>; | |||
using option_t = uint64_t; | |||
using options_t = std::unordered_map<std::string, option_t>; | |||
using result_t = std::pair<std::thread::id, Record>; | |||
using results_t = std::vector<result_t>; | |||
using entry_t = Record; | |||
using bundle_t = ResultBundle; | |||
using thread_dict_t = std::unordered_map<std::thread::id, std::string>; | |||
struct ResultBundle { | |||
profiler::HostTime start_at; | |||
thread_dict_t thread_dict; | |||
options_t options; | |||
std::vector<entry_t> entries; | |||
}; | |||
private: | |||
std::thread::id m_thread_id; | |||
std::vector<Record> m_records; | |||
std::vector<std::any> m_duration_stack; | |||
std::atomic<Status> m_status = Running; | |||
uint64_t m_last_time = 0; | |||
std::string m_thread_name; | |||
static options_t sm_profile_options; | |||
static std::mutex sm_mutex; | |||
static std::unordered_map<std::thread::id, Profiler*> sm_profilers; | |||
static Timer sm_timer; | |||
static profiler::HostTime sm_start_at; | |||
static std::atomic_uint64_t sm_last_id; | |||
static std::atomic_size_t sm_preferred_capacity; | |||
static bool sm_profiling; | |||
@@ -100,7 +117,7 @@ public: | |||
static void reset() { | |||
mgb_assert(sm_profilers.size() == 0, "profiler already running"); | |||
sm_timer.reset(); | |||
sm_start_at = profiler::HostTime::min(); | |||
} | |||
static uint64_t next_id() { | |||
@@ -110,16 +127,13 @@ public: | |||
template <typename T, typename... TArgs> | |||
static uint64_t record(TArgs&&... args) { | |||
auto& profiler = get_instance(); | |||
auto last_time = profiler.m_last_time; | |||
if constexpr (sm_debug) { | |||
Status expected = Running; | |||
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); | |||
} | |||
uint64_t id = next_id(); | |||
uint64_t time = sm_timer.get_nsecs(); | |||
time = std::max(time, last_time + 2000); | |||
profiler.m_last_time = time; | |||
profiler.m_records.push_back({id, time, T{std::forward<TArgs>(args)...}}); | |||
profiler::Time time = sm_timer.record_host(); | |||
profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward<TArgs>(args)...}}); | |||
if constexpr (sm_debug) { | |||
Status expected = Recording; | |||
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); | |||
@@ -127,7 +141,8 @@ public: | |||
return id; | |||
} | |||
static results_t collect() { | |||
static bundle_t collect() { | |||
bundle_t bundle; | |||
MGB_LOCK_GUARD(sm_mutex); | |||
if constexpr (sm_debug) { | |||
for (auto&& [tid, profiler]: sm_profilers) { | |||
@@ -136,17 +151,17 @@ public: | |||
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting)); | |||
} | |||
} | |||
std::vector<std::pair<std::thread::id, Record>> profile_data; | |||
std::vector<entry_t> profile_data; | |||
for (auto&& [tid, profiler]: sm_profilers) { | |||
sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); | |||
for (auto& record: profiler->m_records) { | |||
profile_data.push_back({tid, std::move(record)}); | |||
profile_data.push_back(std::move(record)); | |||
} | |||
profiler->m_records.clear(); | |||
profiler->m_records.reserve(sm_preferred_capacity); | |||
} | |||
std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){ | |||
return lhs.second.id < rhs.second.id; | |||
return lhs.id < rhs.id; | |||
}); | |||
if constexpr (sm_debug) { | |||
for (auto&& [tid, profiler]: sm_profilers) { | |||
@@ -155,7 +170,11 @@ public: | |||
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); | |||
} | |||
} | |||
return profile_data; | |||
bundle.entries = profile_data; | |||
bundle.options = get_options(); | |||
bundle.start_at = sm_start_at; | |||
bundle.thread_dict = get_thread_dict(); | |||
return bundle; | |||
} | |||
static option_t get_option(std::string key, option_t default_val) { | |||
@@ -179,6 +198,7 @@ public: | |||
static void start_profile() { | |||
mgb_assert(!sm_profiling); | |||
sm_start_at = Timer::record_host(); | |||
sm_profiling = true; | |||
} | |||
@@ -189,7 +209,7 @@ public: | |||
static thread_dict_t get_thread_dict(); | |||
static void dump_profile(std::string basename, std::string format, results_t results, options_t options); | |||
static void dump_profile(std::string basename, std::string format, bundle_t result); | |||
}; | |||
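The refactored collect() path can be pictured as: each thread-local Profiler appends Records stamped with a globally increasing id, its thread id and a host timestamp; collect() merges all per-thread buffers and sorts by id so replay order matches dispatch order. A toy model under those assumptions (not the MegEngine API):

#include <algorithm>
#include <any>
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

struct Record {
    uint64_t id;
    std::thread::id tid;
    std::chrono::time_point<std::chrono::high_resolution_clock> time;
    std::any data;  // type-erased event payload
};

std::atomic_uint64_t g_last_id{0};

Record make_record(std::any payload) {
    return {g_last_id++, std::this_thread::get_id(),
            std::chrono::high_resolution_clock::now(), std::move(payload)};
}

int main() {
    std::vector<Record> records;
    records.push_back(make_record(std::string{"Put"}));
    records.push_back(make_record(std::string{"Del"}));
    // merge step would concatenate per-thread buffers here
    std::sort(records.begin(), records.end(),
              [](auto& lhs, auto& rhs) { return lhs.id < rhs.id; });
    std::cout << records.size() << " records collected\n";
}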
@@ -19,7 +19,7 @@ target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHO | |||
target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C) | |||
target_compile_options(imperative_test PRIVATE -Wno-unused-parameter) | |||
set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3) | |||
set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3 nlohmann_json::nlohmann_json) | |||
if(MGE_WITH_CUDA) | |||
list(APPEND LINK_LIBS cudart) | |||