@@ -39,6 +39,9 @@ endif() | |||||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3) | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 ${PROJECT_BINARY_DIR}/third_party/range-v3) | ||||
target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | ||||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/Json ${PROJECT_BINARY_DIR}/third_party/Json) | |||||
target_link_libraries(${MODULE_NAME} PRIVATE nlohmann_json::nlohmann_json) | |||||
target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR}) | target_include_directories(${MODULE_NAME} PUBLIC src/include PRIVATE ${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDE_DIR} ${MGB_OPDEF_OUT_DIR}) | ||||
target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) | target_compile_definitions(${MODULE_NAME} PRIVATE MODULE_NAME=${MODULE_NAME}) | ||||
target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter) | target_compile_options(${MODULE_NAME} PRIVATE -Wno-unused-parameter) | ||||
@@ -1013,9 +1013,8 @@ void init_tensor(py::module m) { | |||||
interpreter_for_py->sync(); | interpreter_for_py->sync(); | ||||
imperative::Profiler::stop_profile(); | imperative::Profiler::stop_profile(); | ||||
auto results = imperative::Profiler::collect(); | auto results = imperative::Profiler::collect(); | ||||
auto options = imperative::Profiler::get_options(); | |||||
return [results=std::move(results), options=std::move(options)](std::string basename, std::string format){ | |||||
imperative::Profiler::dump_profile(basename, format, results, options); | |||||
return [results=std::move(results)](std::string basename, std::string format){ | |||||
imperative::Profiler::dump_profile(basename, format, results); | |||||
}; | }; | ||||
}, py::call_guard<py::gil_scoped_release>()); | }, py::call_guard<py::gil_scoped_release>()); | ||||
m.def("sync", | m.def("sync", | ||||
@@ -19,6 +19,7 @@ | |||||
#include "megbrain/imperative/op_def.h" | #include "megbrain/imperative/op_def.h" | ||||
#include "megbrain/imperative/utils/to_string.h" | #include "megbrain/imperative/utils/to_string.h" | ||||
#include "./stack_manager.h" | |||||
#include "./tensor_info.h" | #include "./tensor_info.h" | ||||
namespace mgb::imperative { | namespace mgb::imperative { | ||||
@@ -193,7 +194,7 @@ struct PopScope { | |||||
} | } | ||||
}; | }; | ||||
using Command = std::variant<Put, | |||||
using CommandData = std::variant<Put, | |||||
ApplyOp, | ApplyOp, | ||||
Del, | Del, | ||||
GetValue, | GetValue, | ||||
@@ -206,14 +207,20 @@ using Command = std::variant<Put, | |||||
PushScope, | PushScope, | ||||
PopScope>; | PopScope>; | ||||
using IdentifiedCommand = std::pair<uint64_t, Command>; | |||||
struct Command { | |||||
uint64_t id; | |||||
CommandData data; | |||||
StackManager::Trace trace; | |||||
}; | |||||
// using IdentifiedCommand = std::pair<uint64_t, Command>; | |||||
} | } | ||||
template <> | template <> | ||||
struct ToStringTrait<interpreter::intl::Command>{ | struct ToStringTrait<interpreter::intl::Command>{ | ||||
std::string operator()(const interpreter::intl::Command& cmd) const { | std::string operator()(const interpreter::intl::Command& cmd) const { | ||||
return std::visit([](const auto& cmd){ | |||||
std::string content = std::visit([](const auto& cmd){ | |||||
std::string result = cmd.get_name(); | std::string result = cmd.get_name(); | ||||
result += "{"; | result += "{"; | ||||
cmd.get_props([&](const char* key, auto&& value) { | cmd.get_props([&](const char* key, auto&& value) { | ||||
@@ -224,7 +231,8 @@ struct ToStringTrait<interpreter::intl::Command>{ | |||||
}); | }); | ||||
result += "}"; | result += "}"; | ||||
return result; | return result; | ||||
}, cmd); | |||||
}, cmd.data); | |||||
return content; | |||||
} | } | ||||
}; | }; | ||||
@@ -127,9 +127,8 @@ Handle ChannelImpl::put(const HostTensorND& value, bool no_cache) { | |||||
MGB_LOCK_GUARD(m_spin); | MGB_LOCK_GUARD(m_spin); | ||||
mgb_assert(check_available(), "Channel already closed"); | mgb_assert(check_available(), "Channel already closed"); | ||||
auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
state.scopes.push("Put"); | |||||
auto _ = StackManager::Guard{"Put", &state.stack_manager}; | |||||
auto info = put_impl(value, no_cache); | auto info = put_impl(value, no_cache); | ||||
state.scopes.pop("Put"); | |||||
return info; | return info; | ||||
} | } | ||||
@@ -158,16 +157,15 @@ Handle ChannelImpl::put(const DeviceTensorND& data, const HostTensorND& hvalue) | |||||
} | } | ||||
TensorInfo* ChannelImpl::put_impl(const DeviceTensorND& data, const HostTensorND& hvalue) { | TensorInfo* ChannelImpl::put_impl(const DeviceTensorND& data, const HostTensorND& hvalue) { | ||||
auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
state.scopes.push("Put"); | |||||
auto _ = StackManager::Guard{"Put", &state.stack_manager}; | |||||
auto info = alloc(); | auto info = alloc(); | ||||
RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandEvent::Put); | |||||
RECORD_EVENT(TensorCommandEvent, info->id, TensorCommandKind::Put); | |||||
init(info, {data.layout(), data.comp_node()}); | init(info, {data.layout(), data.comp_node()}); | ||||
info->mem_desc.id = StorageIdentifier::make(++m_storage_id); | info->mem_desc.id = StorageIdentifier::make(++m_storage_id); | ||||
info->ptr = Tensor::make(data, hvalue); | info->ptr = Tensor::make(data, hvalue); | ||||
RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr()); | RECORD_EVENT(TensorProduceEvent, info->id, info->desc.layout, info->desc.comp_node, data.raw_ptr()); | ||||
info->status = TensorInfo::Produced; | info->status = TensorInfo::Produced; | ||||
RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandFinishEvent::Put); | |||||
state.scopes.pop("Put"); | |||||
RECORD_EVENT(TensorCommandFinishEvent, info->id, TensorCommandKind::Put); | |||||
return info; | return info; | ||||
} | } | ||||
@@ -230,7 +228,7 @@ void ChannelImpl::dispatch_default_cpu( | |||||
auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
auto name = op->trait()->make_name(*op); | auto name = op->trait()->make_name(*op); | ||||
state.scopes.push(name); | |||||
auto _ = StackManager::Guard(name, &state.stack_manager); | |||||
auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | ||||
RECORD_EVENT(ShapeInferEvent, validated); | RECORD_EVENT(ShapeInferEvent, validated); | ||||
@@ -291,9 +289,9 @@ void ChannelImpl::dispatch_default_cpu( | |||||
} | } | ||||
return op_info; | return op_info; | ||||
}; | }; | ||||
RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, tinfo_to_tid(input_infos), tinfo_to_tid(output_infos)); | |||||
state.scopes.pop(name); | |||||
RECORD_EVENT(OpDispatchEvent, op_id, op->trait()->name, op_info_getter, | |||||
tinfo_to_tid(input_infos), tinfo_to_tid(output_infos), | |||||
state.stack_manager.dump()); | |||||
} | } | ||||
void ChannelImpl::dispatch_kernel( | void ChannelImpl::dispatch_kernel( | ||||
@@ -305,7 +303,7 @@ void ChannelImpl::dispatch_kernel( | |||||
auto& options = state.options; | auto& options = state.options; | ||||
auto name = op->trait()->make_name(*op); | auto name = op->trait()->make_name(*op); | ||||
state.scopes.push(name); | |||||
auto _ = StackManager::Guard{name, &state.stack_manager}; | |||||
auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | auto [output_descs, validated] = OpDef::infer_output_attrs_fallible(*op, input_descs); | ||||
RECORD_EVENT(ShapeInferEvent, validated); | RECORD_EVENT(ShapeInferEvent, validated); | ||||
@@ -334,7 +332,9 @@ void ChannelImpl::dispatch_kernel( | |||||
} | } | ||||
return op_info; | return op_info; | ||||
}; | }; | ||||
RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs)); | |||||
RECORD_EVENT(OpDispatchEvent, cmd.id, cmd.op->trait()->name, op_info_getter, | |||||
tinfo_to_tid(cmd.inputs), tinfo_to_tid(cmd.outputs), | |||||
state.stack_manager.dump()); | |||||
m_buffer.enqueue(std::move(cmd)); | m_buffer.enqueue(std::move(cmd)); | ||||
if (!validated && options.async_level == 1) { | if (!validated && options.async_level == 1) { | ||||
sync_impl(); | sync_impl(); | ||||
@@ -346,7 +346,6 @@ void ChannelImpl::dispatch_kernel( | |||||
info->ptr->comp_node().sync(); | info->ptr->comp_node().sync(); | ||||
} | } | ||||
} | } | ||||
state.scopes.pop(name); | |||||
} | } | ||||
SmallVector<Handle> ChannelImpl::apply_op( | SmallVector<Handle> ChannelImpl::apply_op( | ||||
@@ -505,7 +504,8 @@ TensorInfo* ChannelImpl::alloc() { | |||||
}(); | }(); | ||||
info->id = Profiler::next_id(); | info->id = Profiler::next_id(); | ||||
if (Profiler::is_profiling()) { | if (Profiler::is_profiling()) { | ||||
info->name = state.scopes.next_tensor_name(); | |||||
size_t tensor_id = state.stack_manager.current()->next_id("tensor"); | |||||
info->name = state.stack_manager.dump().to_string() + ssprintf(":%zu", tensor_id); | |||||
} | } | ||||
return info; | return info; | ||||
} | } | ||||
@@ -554,7 +554,7 @@ void ChannelImpl::free(TensorInfo* ptr) { | |||||
} | } | ||||
void ChannelImpl::recursive_free(TensorInfo* ptr) { | void ChannelImpl::recursive_free(TensorInfo* ptr) { | ||||
RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandEvent::RecFree); | |||||
RECORD_EVENT(TensorCommandEvent, ptr->id, TensorCommandKind::RecFree); | |||||
SmallVector<TensorInfo*> inps; | SmallVector<TensorInfo*> inps; | ||||
if (ptr->producer) { | if (ptr->producer) { | ||||
for (auto i : ptr->producer->inputs) { | for (auto i : ptr->producer->inputs) { | ||||
@@ -569,7 +569,7 @@ void ChannelImpl::recursive_free(TensorInfo* ptr) { | |||||
recursive_free(i); | recursive_free(i); | ||||
} | } | ||||
} | } | ||||
RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandFinishEvent::RecFree); | |||||
RECORD_EVENT(TensorCommandFinishEvent, ptr->id, TensorCommandKind::RecFree); | |||||
} | } | ||||
void ChannelImpl::real_free(TensorInfo* ptr) { | void ChannelImpl::real_free(TensorInfo* ptr) { | ||||
@@ -625,9 +625,9 @@ void ChannelImpl::regenerate(TensorInfo* dest) { | |||||
m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest}); | m_apply_stack.push({ApplyOp{path->id, path->op, path->inputs, path->outputs, {}}, 0, dest}); | ||||
if (!m_applying) flush_apply_stack(); | if (!m_applying) flush_apply_stack(); | ||||
} else if (dest->evict_type == EvictType::SWAP) { | } else if (dest->evict_type == EvictType::SWAP) { | ||||
RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandEvent::ReGen); | |||||
RECORD_EVENT(TensorCommandEvent, dest->id, TensorCommandKind::ReGen); | |||||
produce_tensor(dest, Tensor::make(dest->h_value)); | produce_tensor(dest, Tensor::make(dest->h_value)); | ||||
RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandFinishEvent::ReGen); | |||||
RECORD_EVENT(TensorCommandFinishEvent, dest->id, TensorCommandKind::ReGen); | |||||
} | } | ||||
} | } | ||||
@@ -721,22 +721,24 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) { | |||||
// refcnt --, owners: [tensor_inputs] | // refcnt --, owners: [tensor_inputs] | ||||
// if it's decreased to 1, would be detected at @see: proxy_graph_detail::apply_on_physical_tensor | // if it's decreased to 1, would be detected at @see: proxy_graph_detail::apply_on_physical_tensor | ||||
uint64_t del_id = del->id; | uint64_t del_id = del->id; | ||||
RECORD_EVENT(OpDelEvent, del_id); | |||||
RECORD_EVENT(TensorCommandEvent, del_id, TensorCommandKind::Del); | |||||
free(del); | free(del); | ||||
RECORD_EVENT(OpDelFinishEvent, del_id); | |||||
RECORD_EVENT(TensorCommandFinishEvent, del_id, TensorCommandKind::Del); | |||||
} | } | ||||
// Before wait | // Before wait | ||||
//TODO: split operator wait and execute so that OpWait could be correctly recorded. | //TODO: split operator wait and execute so that OpWait could be correctly recorded. | ||||
// Before execute | // Before execute | ||||
for (auto&& [device, kernel_id]: kernels) { | for (auto&& [device, kernel_id]: kernels) { | ||||
RECORD_EVENT(KernelExecuteEvent, apply_id, kernel_id, Timer::record_event(device)); | |||||
RECORD_EVENT(KernelLaunchEvent, apply_id, kernel_id, device); | |||||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); | |||||
} | } | ||||
// Apply op | // Apply op | ||||
// Here std::move is REQUIRED for removing duplicated references. | // Here std::move is REQUIRED for removing duplicated references. | ||||
auto outputs = apply_on_physical_tensor(apply_on_physical_tensor, *cmd.op, inputs); | auto outputs = apply_on_physical_tensor(apply_on_physical_tensor, *cmd.op, inputs); | ||||
// After execute | // After execute | ||||
for (auto&& [device, kernel_id]: kernels) { | for (auto&& [device, kernel_id]: kernels) { | ||||
RECORD_EVENT(KernelExecuteFinishEvent, apply_id, kernel_id, Timer::record_event(device)); | |||||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); | |||||
RECORD_EVENT(KernelLaunchFinishEvent, apply_id, kernel_id, device); | |||||
} | } | ||||
// End profiling operator | // End profiling operator | ||||
mgb_assert(outputs.size() == cmd.outputs.size()); | mgb_assert(outputs.size() == cmd.outputs.size()); | ||||
@@ -787,7 +789,7 @@ void ChannelImpl::flush_apply_stack() { | |||||
m_dtr.pin(cmd.inputs); | m_dtr.pin(cmd.inputs); | ||||
} | } | ||||
if (recomp) { | if (recomp) { | ||||
RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandEvent::ReGen); | |||||
RECORD_EVENT(TensorCommandEvent, recomp->id, TensorCommandKind::ReGen); | |||||
} | } | ||||
} | } | ||||
bool regen = false; | bool regen = false; | ||||
@@ -810,7 +812,7 @@ void ChannelImpl::flush_apply_stack() { | |||||
m_apply_stack.pop(); | m_apply_stack.pop(); | ||||
do_apply_op(cmd_backup); | do_apply_op(cmd_backup); | ||||
if (recomp_backup) { | if (recomp_backup) { | ||||
RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandFinishEvent::ReGen); | |||||
RECORD_EVENT(TensorCommandFinishEvent, recomp_backup->id, TensorCommandKind::ReGen); | |||||
for (auto o : cmd_backup.outputs) { | for (auto o : cmd_backup.outputs) { | ||||
if (o) { | if (o) { | ||||
m_dtr.update_dsu_after_recompute(o); | m_dtr.update_dsu_after_recompute(o); | ||||
@@ -902,7 +904,7 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) { | |||||
check_worker_exc_unsafe(); | check_worker_exc_unsafe(); | ||||
return require_host ? host_available() : static_cast<bool>(info->ptr); | return require_host ? host_available() : static_cast<bool>(info->ptr); | ||||
}); | }); | ||||
RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop, m_waitee == nullptr); | |||||
RECORD_EVENT(TensorWaitPropFinishEvent, info->id, m_waitee_id, prop); | |||||
m_waitee = nullptr; | m_waitee = nullptr; | ||||
return info->ptr; | return info->ptr; | ||||
} | } | ||||
@@ -1003,7 +1005,7 @@ std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPt | |||||
return {outputs_desc, alloc_storage(outputs_desc), alloc_storage(workspaces_desc)}; | return {outputs_desc, alloc_storage(outputs_desc), alloc_storage(workspaces_desc)}; | ||||
} | } | ||||
void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
void ChannelImpl::process_one_task(Command& icmd) { | |||||
using namespace ranges; | using namespace ranges; | ||||
using namespace ranges::views; | using namespace ranges::views; | ||||
auto& state = get_worker_state(); | auto& state = get_worker_state(); | ||||
@@ -1012,10 +1014,12 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
auto cmd_visitor = [&](const auto& cmd) { | auto cmd_visitor = [&](const auto& cmd) { | ||||
using T = std::decay_t<decltype(cmd)>; | using T = std::decay_t<decltype(cmd)>; | ||||
if constexpr (std::is_same_v<T, Put>) { | if constexpr (std::is_same_v<T, Put>) { | ||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Put); | |||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Put); | |||||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); | |||||
auto value = cmd.no_cache ? std::make_shared<Tensor>(cmd.value) : Tensor::make(cmd.value); | auto value = cmd.no_cache ? std::make_shared<Tensor>(cmd.value) : Tensor::make(cmd.value); | ||||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(cmd.value.comp_node())); | |||||
produce_tensor(cmd.dest, std::move(value)); | produce_tensor(cmd.dest, std::move(value)); | ||||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Put); | |||||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Put); | |||||
sample_on_device(cmd.dest->desc.comp_node, false); | sample_on_device(cmd.dest->desc.comp_node, false); | ||||
} else if constexpr (std::is_same_v<T, ApplyOp>) { | } else if constexpr (std::is_same_v<T, ApplyOp>) { | ||||
for (auto& i : cmd.inputs) { | for (auto& i : cmd.inputs) { | ||||
@@ -1084,11 +1088,11 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
} | } | ||||
} | } | ||||
} else if constexpr (std::is_same_v<T, Del>) { | } else if constexpr (std::is_same_v<T, Del>) { | ||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Del); | |||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Del); | |||||
CompNode device = cmd.dest->desc.comp_node; | CompNode device = cmd.dest->desc.comp_node; | ||||
uint64_t tensor_id = cmd.dest->id; | uint64_t tensor_id = cmd.dest->id; | ||||
free(cmd.dest); | free(cmd.dest); | ||||
RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandFinishEvent::Del); | |||||
RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandKind::Del); | |||||
sample_on_device(device, false); | sample_on_device(device, false); | ||||
} else if constexpr (std::is_same_v<T, GetValue>) { | } else if constexpr (std::is_same_v<T, GetValue>) { | ||||
if (cmd.dest->invalid) return; | if (cmd.dest->invalid) return; | ||||
@@ -1102,26 +1106,26 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
imperative_log_profile_end("GetValue"); | imperative_log_profile_end("GetValue"); | ||||
} else if constexpr (std::is_same_v<T, SwapIn>) { | } else if constexpr (std::is_same_v<T, SwapIn>) { | ||||
if (cmd.dest->invalid) return; | if (cmd.dest->invalid) return; | ||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapIn); | |||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapIn); | |||||
produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value)); | produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value)); | ||||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapIn); | |||||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapIn); | |||||
sample_on_device(cmd.dest->desc.comp_node, false); | sample_on_device(cmd.dest->desc.comp_node, false); | ||||
} else if constexpr (std::is_same_v<T, SwapOut>) { | } else if constexpr (std::is_same_v<T, SwapOut>) { | ||||
if (cmd.dest->invalid) return; | if (cmd.dest->invalid) return; | ||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapOut); | |||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::SwapOut); | |||||
cmd.dest->h_value = cmd.dest->ptr->get_value(); | cmd.dest->h_value = cmd.dest->ptr->get_value(); | ||||
if (cmd.dest->evict_type == EvictType::NONE) { | if (cmd.dest->evict_type == EvictType::NONE) { | ||||
cmd.dest->evict_type = EvictType::SWAP; | cmd.dest->evict_type = EvictType::SWAP; | ||||
cmd.dest->status = TensorInfo::Swapped; | cmd.dest->status = TensorInfo::Swapped; | ||||
release_tensor(cmd.dest); | release_tensor(cmd.dest); | ||||
} | } | ||||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapOut); | |||||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::SwapOut); | |||||
sample_on_device(cmd.dest->desc.comp_node, false); | sample_on_device(cmd.dest->desc.comp_node, false); | ||||
} else if constexpr (std::is_same_v<T, Drop>) { | } else if constexpr (std::is_same_v<T, Drop>) { | ||||
if (cmd.dest->invalid) return; | if (cmd.dest->invalid) return; | ||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Drop); | |||||
RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandKind::Drop); | |||||
do_drop(cmd.dest, true); | do_drop(cmd.dest, true); | ||||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Drop); | |||||
RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandKind::Drop); | |||||
} else if constexpr (std::is_same_v<T, SetOption>) { | } else if constexpr (std::is_same_v<T, SetOption>) { | ||||
options.set_option(cmd.key, cmd.value); | options.set_option(cmd.key, cmd.value); | ||||
} else if constexpr (std::is_same_v<T, StartProfile>) { | } else if constexpr (std::is_same_v<T, StartProfile>) { | ||||
@@ -1138,6 +1142,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
if (Profiler::get_option("sample_rate", 0)) { | if (Profiler::get_option("sample_rate", 0)) { | ||||
sample_on_device(device, true); | sample_on_device(device, true); | ||||
} | } | ||||
RECORD_EVENT(RecordDeviceEvent, Timer::record_device(device)); | |||||
}); | }); | ||||
RECORD_EVENT(StartProfileFinishEvent); | RECORD_EVENT(StartProfileFinishEvent); | ||||
} else if constexpr (std::is_same_v<T, StopProfile>) { | } else if constexpr (std::is_same_v<T, StopProfile>) { | ||||
@@ -1186,7 +1191,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) { | |||||
notify_tensor_unsafe(m_waitee); | notify_tensor_unsafe(m_waitee); | ||||
} | } | ||||
} | } | ||||
}, icmd.second); | |||||
}, icmd.data); | |||||
} | } | ||||
void ChannelImpl::check_worker_exc_unsafe() { | void ChannelImpl::check_worker_exc_unsafe() { | ||||
@@ -1203,12 +1208,13 @@ void ChannelImpl::check_worker_exc_unsafe() { | |||||
} | } | ||||
} | } | ||||
void ChannelImpl::CommandBuffer::enqueue(Command cmd) { | |||||
void ChannelImpl::CommandBuffer::enqueue(CommandData cmd) { | |||||
auto& state = m_owner->get_channel_state(); | |||||
if (std::get_if<Del>(&cmd) && fuse_del(std::get<Del>(cmd))) { | if (std::get_if<Del>(&cmd) && fuse_del(std::get<Del>(cmd))) { | ||||
return; | return; | ||||
} | } | ||||
// mgb_log_debug("%s Enqueued", to_string(cmd).c_str()); | // mgb_log_debug("%s Enqueued", to_string(cmd).c_str()); | ||||
m_commands.push_back(std::move(cmd)); | |||||
m_commands.push_back({Profiler::next_id(), std::move(cmd), state.stack_manager.dump()}); | |||||
auto flush_pos = flush_pos_for(m_commands.back()); | auto flush_pos = flush_pos_for(m_commands.back()); | ||||
flush(flush_pos); | flush(flush_pos); | ||||
} | } | ||||
@@ -1222,7 +1228,7 @@ void ChannelImpl::CommandBuffer::flush(Handle pos) { | |||||
if (Profiler::is_profiling()) { | if (Profiler::is_profiling()) { | ||||
mgb_log_debug("%s Flushed", to_string(*iter).c_str()); | mgb_log_debug("%s Flushed", to_string(*iter).c_str()); | ||||
} | } | ||||
m_owner->m_worker.add_task(IdentifiedCommand{Profiler::next_id(), std::move(*iter)}); | |||||
m_owner->m_worker.add_task(std::move(*iter)); | |||||
} | } | ||||
m_commands.erase(m_commands.begin(), pos); | m_commands.erase(m_commands.begin(), pos); | ||||
} | } | ||||
@@ -1248,7 +1254,7 @@ auto ChannelImpl::CommandBuffer::flush_pos_for(const Command& cmd) -> Handle { | |||||
return m_commands.begin() + (m_commands.size() - buffer_length); | return m_commands.begin() + (m_commands.size() - buffer_length); | ||||
} | } | ||||
return m_commands.begin(); | return m_commands.begin(); | ||||
}, cmd); | |||||
}, cmd.data); | |||||
} | } | ||||
/** | /** | ||||
@@ -1261,7 +1267,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) { | |||||
// TODO: eliminate Puts | // TODO: eliminate Puts | ||||
auto begin = m_commands.begin(), end = m_commands.end(); | auto begin = m_commands.begin(), end = m_commands.end(); | ||||
auto apply_iter = std::find_if(begin, end, [dest](const Command& cmd){ | auto apply_iter = std::find_if(begin, end, [dest](const Command& cmd){ | ||||
if (auto* apply = std::get_if<ApplyOp>(&cmd)) { | |||||
if (auto* apply = std::get_if<ApplyOp>(&cmd.data)) { | |||||
return std::count(apply->inputs.begin(), apply->inputs.end(), dest) > 0; | return std::count(apply->inputs.begin(), apply->inputs.end(), dest) > 0; | ||||
} | } | ||||
return false; | return false; | ||||
@@ -1270,7 +1276,7 @@ bool ChannelImpl::CommandBuffer::fuse_del(const Del& cmd) { | |||||
return false; | return false; | ||||
} | } | ||||
// mgb_log_debug("%s Fused", to_string(Command{cmd}).c_str()); | // mgb_log_debug("%s Fused", to_string(Command{cmd}).c_str()); | ||||
std::get<ApplyOp>(*apply_iter).dels.push_back(dest); | |||||
std::get<ApplyOp>(apply_iter->data).dels.push_back(dest); | |||||
return true; | return true; | ||||
} | } | ||||
@@ -1297,7 +1303,7 @@ auto ChannelImpl::CommandBuffer::find_last_usage(TensorInfo* dest, Range range) | |||||
found = iter; | found = iter; | ||||
} | } | ||||
} | } | ||||
}, *iter); | |||||
}, iter->data); | |||||
}; | }; | ||||
return found; | return found; | ||||
} | } | ||||
@@ -1313,7 +1319,7 @@ auto ChannelImpl::CommandBuffer::find_produce(TensorInfo* dest, Range range) | |||||
return cmd.dest == dest; | return cmd.dest == dest; | ||||
} | } | ||||
return false; | return false; | ||||
}, cmd); | |||||
}, cmd.data); | |||||
}); | }); | ||||
} | } | ||||
@@ -1340,7 +1346,7 @@ void ChannelImpl::push_scope(std::string name) { | |||||
MGB_LOCK_GUARD(m_spin); | MGB_LOCK_GUARD(m_spin); | ||||
mgb_assert(check_available(), "Channel already closed"); | mgb_assert(check_available(), "Channel already closed"); | ||||
auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
state.scopes.push(name); | |||||
state.stack_manager.enter(name); | |||||
RECORD_EVENT(ScopeEvent, name); | RECORD_EVENT(ScopeEvent, name); | ||||
m_buffer.enqueue(PushScope{name}); | m_buffer.enqueue(PushScope{name}); | ||||
} | } | ||||
@@ -1349,7 +1355,7 @@ void ChannelImpl::pop_scope(std::string name) { | |||||
MGB_LOCK_GUARD(m_spin); | MGB_LOCK_GUARD(m_spin); | ||||
mgb_assert(check_available(), "Channel already closed"); | mgb_assert(check_available(), "Channel already closed"); | ||||
auto& state = get_channel_state(); | auto& state = get_channel_state(); | ||||
state.scopes.pop(name); | |||||
state.stack_manager.exit(name); | |||||
RECORD_EVENT(ScopeFinishEvent, name); | RECORD_EVENT(ScopeFinishEvent, name); | ||||
m_buffer.enqueue(PopScope{name}); | m_buffer.enqueue(PopScope{name}); | ||||
} | } | ||||
@@ -26,6 +26,7 @@ | |||||
#include "./commands.h" | #include "./commands.h" | ||||
#include "./tensor_info.h" | #include "./tensor_info.h" | ||||
#include "./option_manager.h" | #include "./option_manager.h" | ||||
#include "./stack_manager.h" | |||||
#include "../profiler/events.h" | #include "../profiler/events.h" | ||||
@@ -94,7 +95,7 @@ private: | |||||
TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop); | TensorPtr wait_tensor(TensorInfo* info, profiler::TensorProp prop); | ||||
void notify_tensor_unsafe(TensorInfo* info); | void notify_tensor_unsafe(TensorInfo* info); | ||||
void process_one_task(IdentifiedCommand&); | |||||
void process_one_task(Command&); | |||||
void check_worker_exc_unsafe(); | void check_worker_exc_unsafe(); | ||||
@@ -129,10 +130,10 @@ private: | |||||
void assert_in_worker(); | void assert_in_worker(); | ||||
std::thread::id get_worker_tid(); | std::thread::id get_worker_tid(); | ||||
template <typename TCommand> | |||||
void enqueue_command(TCommand&& cmd) { | |||||
m_buffer.enqueue(Command{std::forward<TCommand>(cmd)}); | |||||
} | |||||
// template <typename TCommand> | |||||
// void enqueue_command(TCommand&& cmd) { | |||||
// m_buffer.enqueue(Command{std::forward<TCommand>(cmd)}); | |||||
// } | |||||
void sample_on_device(CompNode device, bool force); | void sample_on_device(CompNode device, bool force); | ||||
@@ -153,13 +154,13 @@ private: | |||||
bool m_applying = false; | bool m_applying = false; | ||||
bool m_closed = false; | bool m_closed = false; | ||||
struct WorkQueue : AsyncQueueSC<IdentifiedCommand, WorkQueue> { | |||||
struct WorkQueue : AsyncQueueSC<Command, WorkQueue> { | |||||
// set max_spin=0 to prevent Queue fetch task in busy wait manner. | // set max_spin=0 to prevent Queue fetch task in busy wait manner. | ||||
// this won't affect throughput when python interpreter is sending enough task, | // this won't affect throughput when python interpreter is sending enough task, | ||||
// but will significantly save CPU time when waiting for task, e.g. wait for data input | // but will significantly save CPU time when waiting for task, e.g. wait for data input | ||||
// limit pending tasks to 10000 | // limit pending tasks to 10000 | ||||
WorkQueue(ChannelImpl* owner) | WorkQueue(ChannelImpl* owner) | ||||
: AsyncQueueSC<IdentifiedCommand, WorkQueue>(0, 10000), m_owner(owner) { | |||||
: AsyncQueueSC<Command, WorkQueue>(0, 10000), m_owner(owner) { | |||||
sys::set_thread_name("interpreter"); | sys::set_thread_name("interpreter"); | ||||
if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) { | if (const char* env_val = MGB_GETENV("MEGENGINE_ASYNC_QUEUE_SIZE")) { | ||||
int len = strlen(env_val); | int len = strlen(env_val); | ||||
@@ -171,7 +172,7 @@ private: | |||||
update_max_items(val); | update_max_items(val); | ||||
} | } | ||||
} | } | ||||
void process_one_task(IdentifiedCommand& icmd) { | |||||
void process_one_task(Command& icmd) { | |||||
m_owner->process_one_task(icmd); | m_owner->process_one_task(icmd); | ||||
} | } | ||||
void on_async_queue_worker_thread_start() override; | void on_async_queue_worker_thread_start() override; | ||||
@@ -193,7 +194,7 @@ private: | |||||
*/ | */ | ||||
struct CommandBuffer { | struct CommandBuffer { | ||||
CommandBuffer(ChannelImpl* owner) : m_owner(owner) {} | CommandBuffer(ChannelImpl* owner) : m_owner(owner) {} | ||||
void enqueue(Command cmd); | |||||
void enqueue(CommandData cmd); | |||||
bool empty() const { | bool empty() const { | ||||
return m_commands.empty(); | return m_commands.empty(); | ||||
} | } | ||||
@@ -224,91 +225,13 @@ private: | |||||
//! level 0: both sync. | //! level 0: both sync. | ||||
int m_async_level = 2; | int m_async_level = 2; | ||||
struct Scope { | |||||
std::string name; | |||||
std::unordered_map<std::string, std::unique_ptr<Scope>> children; | |||||
size_t version = 0; | |||||
size_t parent_version = 0; | |||||
size_t tensor_count = 0; | |||||
Scope* active_child = nullptr; | |||||
Scope* parent = nullptr; | |||||
Scope* enter(std::string name) { | |||||
auto& child = children[name]; | |||||
if (!child) { | |||||
child = std::make_unique<Scope>(); | |||||
child->name = name; | |||||
child->parent = this; | |||||
} | |||||
if (version != child->parent_version) { | |||||
child->version = 0; | |||||
child->parent_version = version; | |||||
} else { | |||||
child->version++; | |||||
} | |||||
child->tensor_count = 0; | |||||
return active_child = child.get(); | |||||
} | |||||
Scope* exit(std::string name) { | |||||
mgb_assert(this->name == name, "scope name mismatch"); | |||||
parent->active_child = nullptr; | |||||
return parent; | |||||
} | |||||
}; | |||||
class ScopeManager { | |||||
private: | |||||
Scope m_root; | |||||
Scope* m_current_scope = &m_root; | |||||
public: | |||||
class ScopeGuard{ | |||||
private: | |||||
ScopeManager* m_manager; | |||||
std::string m_name; | |||||
public: | |||||
ScopeGuard(ScopeManager* manager, std::string name): m_manager{manager}, m_name{name} { | |||||
m_manager->push(m_name); | |||||
} | |||||
~ScopeGuard() { | |||||
m_manager->pop(m_name); | |||||
} | |||||
}; | |||||
void push(std::string name) { | |||||
m_current_scope = m_current_scope->enter(name); | |||||
} | |||||
void pop(std::string name) { | |||||
m_current_scope = m_current_scope->exit(name); | |||||
} | |||||
std::string next_tensor_name() { | |||||
std::string builder; | |||||
Scope* scope = &m_root; | |||||
while (true) { | |||||
builder.append(scope->name); | |||||
if (scope->version != 0) { | |||||
builder.append(ssprintf("(%ld)", scope->version)); | |||||
} | |||||
if (scope != &m_root) { | |||||
builder.append("."); | |||||
} | |||||
if (scope->active_child == nullptr) { | |||||
builder.append(ssprintf(":%%%ld", scope->tensor_count++)); | |||||
break; | |||||
} else { | |||||
scope = scope->active_child; | |||||
} | |||||
} | |||||
return builder; | |||||
} | |||||
}; | |||||
struct State { | struct State { | ||||
std::thread::id tid; | std::thread::id tid; | ||||
OptionManager options; | OptionManager options; | ||||
}; | }; | ||||
struct ChannelState: State { | struct ChannelState: State { | ||||
ScopeManager scopes; | |||||
StackManager stack_manager; | |||||
}; | }; | ||||
struct WorkerState: State {}; | struct WorkerState: State {}; | ||||
@@ -0,0 +1,188 @@ | |||||
/** | |||||
* \file imperative/src/impl/interpreter/stack_manager.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||||
#pragma once | |||||
#include <string> | |||||
#include <memory> | |||||
#include <unordered_map> | |||||
#include "megbrain/utils/metahelper.h" | |||||
#include "megbrain/utils/small_vector.h" | |||||
namespace mgb::imperative::interpreter::intl{ | |||||
class StackSnapshot; | |||||
class StackManager: public NonCopyableObj { | |||||
public: | |||||
class Node; | |||||
class Guard; | |||||
struct Frame; | |||||
class Trace; | |||||
private: | |||||
std::unique_ptr<Node> m_root = nullptr; | |||||
Node* m_current = nullptr; | |||||
SmallVector<uint64_t> m_trace_id_stack; | |||||
uint64_t m_last_trace_id = 0; | |||||
public: | |||||
StackManager(); | |||||
std::pair<Node*, uint64_t> enter(std::string name); | |||||
void exit(std::string name); | |||||
Trace dump(); | |||||
Node* current(); | |||||
}; | |||||
// One node of the scope tree. A node is re-versioned by its parent via
// update_version()/reset_version(); per-key sequential ids restart with
// each new version.
class StackManager::Node: public NonCopyableObj {
private:
    std::string m_name;
    std::unordered_map<std::string, std::unique_ptr<Node>> m_children;
    std::unordered_map<std::string, size_t> m_id_table;  // per-key counters for next_id()
    Node* m_parent = nullptr;
    int64_t m_depth = -1;   // root sits at -1 so first-level scopes have depth 0
    uint64_t m_version = 0;

    // Private: nodes are created only via make() (root) or operator[] (children).
    explicit Node(std::string name, Node* parent): m_name{std::move(name)}, m_parent{parent} {
        if (parent) {
            m_depth = parent->m_depth + 1;
        }
    }
public:
    const std::string& name() const {
        return m_name;
    }
    // Child lookup; lazily creates the child on first access.
    Node* operator[](const std::string& name) {
        auto& child = m_children[name];
        if (child == nullptr) {
            child.reset(new Node(name, this));
        }
        return child.get();
    }
    Node* parent() {
        return m_parent;
    }
    bool is_root() {
        return m_parent == nullptr;
    }
    uint64_t version() const {
        return m_version;
    }
    // Bump this node's version and invalidate all children: their versions
    // and id counters restart under the new incarnation of this scope.
    void update_version() {
        ++m_version;
        for (auto&& [key, child]: m_children) {
            child->reset_version();
        }
        m_id_table.clear();
    }
    void reset_version() {
        m_version = 0;
        m_id_table.clear();
    }
    int64_t depth() const {
        return m_depth;
    }
    // Next sequential id for `key` within the current version of this scope.
    // (fix: key taken by const-ref instead of by value — no copy per call)
    uint64_t next_id(const std::string& key) {
        return m_id_table[key]++;
    }
    // Create the (nameless) root node.
    static std::unique_ptr<Node> make() {
        return std::unique_ptr<Node>(new Node("", nullptr));
    }
};
// RAII wrapper pairing StackManager::enter()/exit(). A null manager makes
// the guard a no-op. release() allows exiting early and is idempotent.
class StackManager::Guard {
private:
    std::string m_name;
    StackManager* m_manager;
public:
    Guard(std::string name, StackManager* manager): m_name{std::move(name)}, m_manager{manager}{
        if (m_manager) {
            m_manager->enter(m_name);
        }
    }
    // fix: non-copyable/non-movable — a copy would call exit() twice for one enter()
    Guard(const Guard&) = delete;
    Guard& operator=(const Guard&) = delete;
    ~Guard() {
        release();
    }
    void release() {
        if (m_manager) {
            m_manager->exit(m_name);
            m_manager = nullptr;
        }
    }
};
// One captured stack entry: which scope node, and which incarnation of it.
struct StackManager::Frame {
    StackManager::Node* node;  // borrowed; owned by the StackManager tree
    uint64_t version;          // node->version() at capture time
};
// Immutable snapshot of the open scope chain, ordered outermost-first.
class StackManager::Trace {
private:
    SmallVector<StackManager::Frame> m_frames;
    uint64_t m_id = 0;
public:
    // Capture the chain from the root down to `top` (top->depth()+1 frames).
    explicit Trace(StackManager::Node* top, uint64_t id): m_id{id} {
        int64_t nr_frames = top->depth() + 1;
        m_frames = SmallVector<StackManager::Frame>(nr_frames);
        StackManager::Node* node = top;
        for (int64_t i = 0; i < nr_frames; ++i) {
            // Fill back-to-front while walking parents, so frames end up root-first.
            m_frames[m_frames.size()-1-i] = {node, node->version()};
            node = node->parent();
        }
        // fix: give the assert a real message (was empty)
        mgb_assert(node->is_root(), "stack trace must terminate at the root scope");
    }
    Trace() = default;
    // Render as e.g. "a.b[2].c"; a "[version]" suffix appears only when non-zero.
    std::string to_string() const {
        std::string buffer;
        for (auto&& [node, version]: m_frames) {
            if (!buffer.empty()) {
                buffer.append(".");
            }
            buffer.append(node->name());
            if (version != 0) {
                // fix: version is uint64_t; "%zu" expects size_t (mismatch on 32-bit)
                buffer.append(ssprintf("[%llu]", static_cast<unsigned long long>(version)));
            }
        }
        return buffer;
    }
    const SmallVector<StackManager::Frame>& frames() const {
        return m_frames;
    }
    uint64_t id() const {
        return m_id;
    }
};
inline StackManager::StackManager() { | |||||
m_root = Node::make(); | |||||
m_current = m_root.get(); | |||||
} | |||||
inline std::pair<StackManager::Node*, uint64_t> StackManager::enter(std::string name) { | |||||
m_current = (*m_current)[name]; | |||||
m_trace_id_stack.push_back(++m_last_trace_id); | |||||
return {m_current, m_current->version()}; | |||||
} | |||||
inline void StackManager::exit(std::string name) { | |||||
mgb_assert(m_current->name() == name, "scope name mismatch"); | |||||
m_current = m_current->parent(); | |||||
m_trace_id_stack.pop_back(); | |||||
m_current->update_version(); | |||||
} | |||||
inline StackManager::Trace StackManager::dump() { | |||||
return Trace(m_current, m_trace_id_stack.empty() ? 0 : m_trace_id_stack.back()); | |||||
} | |||||
inline StackManager::Node* StackManager::current() { | |||||
return m_current; | |||||
} | |||||
} |
@@ -102,7 +102,8 @@ std::vector<std::pair<const char*, std::string>> props(const OpDef& def) { | |||||
} | } | ||||
std::string make_name(const OpDef& def) { | std::string make_name(const OpDef& def) { | ||||
return "OprAttr"; | |||||
auto&& attr = def.cast_final_safe<OprAttr>(); | |||||
return attr.type; | |||||
} | } | ||||
OP_TRAIT_REG(OprAttr, OprAttr) | OP_TRAIT_REG(OprAttr, OprAttr) | ||||
@@ -27,25 +27,11 @@ | |||||
namespace mgb { | namespace mgb { | ||||
namespace imperative { | namespace imperative { | ||||
uint64_t Timer::get_nsecs() { | |||||
using namespace std::chrono; | |||||
auto finish = steady_clock::now(); | |||||
auto duration = duration_cast<nanoseconds>(finish - m_start); | |||||
return duration.count(); | |||||
profiler::Time Timer::record_host() { | |||||
return std::chrono::high_resolution_clock::now(); | |||||
} | } | ||||
uint64_t Timer::get_started_at() { | |||||
return m_started_at; | |||||
} | |||||
void Timer::reset() { | |||||
using namespace std::chrono; | |||||
m_start = steady_clock::now(); | |||||
auto now_ns = duration_cast<nanoseconds>(std::chrono::system_clock::now().time_since_epoch()); | |||||
m_started_at = now_ns.count(); | |||||
} | |||||
std::shared_ptr<CompNode::Event> Timer::record_event(CompNode device) { | |||||
std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) { | |||||
auto event = EventPool::with_timer().alloc_shared(device); | auto event = EventPool::with_timer().alloc_shared(device); | ||||
event->record(); | event->record(); | ||||
return event; | return event; | ||||
@@ -55,13 +41,13 @@ Profiler::options_t Profiler::sm_profile_options; | |||||
std::mutex Profiler::sm_mutex; | std::mutex Profiler::sm_mutex; | ||||
std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers; | std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers; | ||||
Timer Profiler::sm_timer; | Timer Profiler::sm_timer; | ||||
profiler::HostTime Profiler::sm_start_at; | |||||
std::atomic_uint64_t Profiler::sm_last_id = 0; | std::atomic_uint64_t Profiler::sm_last_id = 0; | ||||
bool Profiler::sm_profiling = false; | bool Profiler::sm_profiling = false; | ||||
thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>(); | thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>(); | ||||
std::atomic_size_t Profiler::sm_preferred_capacity; | std::atomic_size_t Profiler::sm_preferred_capacity; | ||||
// Snapshot thread-id -> thread-name for all registered profilers.
auto Profiler::get_thread_dict() -> thread_dict_t {
    // sm_profilers is shared mutable state; hold sm_mutex while iterating.
    MGB_LOCK_GUARD(sm_mutex);
    thread_dict_t thread_dict;
    for (auto&& [tid, profiler]: sm_profilers) {
        thread_dict[tid] = profiler->m_thread_name;
    }
    return thread_dict;
}
void Profiler::dump_profile(std::string basename, std::string format, results_t results, options_t options) { | |||||
auto thread_dict = get_thread_dict(); | |||||
if (format == "chrome_timeline.json") { | |||||
profiler::dump_chrome_timeline(basename, options, thread_dict, results); | |||||
} else if (format == "memory_flow.svg") { | |||||
profiler::dump_memory_flow(basename, options, thread_dict, results); | |||||
} else { | |||||
void Profiler::dump_profile(std::string basename, std::string format, bundle_t result) { | |||||
std::unordered_map<std::string, void(*)(std::string, bundle_t)> format_table; | |||||
auto iter = format_table.find(format); | |||||
if (iter == format_table.end()) { | |||||
mgb_log_error("unsupported profiling format %s", format.c_str()); | mgb_log_error("unsupported profiling format %s", format.c_str()); | ||||
} | } | ||||
return (iter->second)(basename, result); | |||||
} | } | ||||
} // namespace imperative | } // namespace imperative | ||||
@@ -17,6 +17,10 @@ | |||||
#error Unsupported platform | #error Unsupported platform | ||||
#endif | #endif | ||||
#include "nlohmann/json.hpp" | |||||
#include "megbrain/utils/debug.h" | |||||
#include "./formats.h" | #include "./formats.h" | ||||
#include "./states.h" | #include "./states.h" | ||||
@@ -53,8 +57,9 @@ public: | |||||
m_idx = idx; | m_idx = idx; | ||||
return *this; | return *this; | ||||
} | } | ||||
// Set the event timestamp. Accepts any std::chrono duration; stored as
// fractional microseconds, the unit chrome://tracing expects.
// (Diff residue resolved: replaces the old raw-uint64_t overload.)
template <typename TDuration>
ChromeTraceEvent& ts(TDuration ts) {
    m_ts = std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(ts).count();
    return *this;
}
ChromeTraceEvent& dur(uint64_t dur) { | ChromeTraceEvent& dur(uint64_t dur) { | ||||
@@ -69,51 +74,46 @@ public: | |||||
m_bp = bp; | m_bp = bp; | ||||
return *this; | return *this; | ||||
} | } | ||||
// (Diff residue resolved: args storage migrated from the in-house json
// library to nlohmann::json; the old null-check-then-allocate dance is gone.)
ChromeTraceEvent& args(nlohmann::json args) {
    m_args = std::move(args);
    return *this;
}
ChromeTraceEvent& arg(std::string key, std::string value) {
    m_args[key] = std::move(value);
    return *this;
}
ChromeTraceEvent& arg(std::string key, double value) {
    m_args[key] = value;
    return *this;
}
ChromeTraceEvent& arg(std::string key, nlohmann::json value) {
    // fix: move instead of copying the (potentially deep) json value
    m_args[key] = std::move(value);
    return *this;
}
// Attach the scope-stack snapshot emitted into the "stack" field of to_json().
ChromeTraceEvent& stack(Trace trace) {
    m_stack = std::move(trace);
    return *this;
}
std::shared_ptr<json::Object> to_json() const { | |||||
auto result = json::Object::make(); | |||||
nlohmann::json to_json() const { | |||||
nlohmann::json result; | |||||
auto prop_str = [&](auto key, auto value) { | auto prop_str = [&](auto key, auto value) { | ||||
if (value.empty()) { | if (value.empty()) { | ||||
return; | return; | ||||
} | } | ||||
(*result)[key] = json::String::make(value); | |||||
result[key] = value; | |||||
}; | }; | ||||
auto prop_num = [&](auto key, auto value) { | auto prop_num = [&](auto key, auto value) { | ||||
if (!value) { | if (!value) { | ||||
return; | return; | ||||
} | } | ||||
(*result)[key] = json::Number::make(value.value()); | |||||
result[key] = value.value(); | |||||
}; | }; | ||||
auto prop_char = [&](auto key, auto value) { | auto prop_char = [&](auto key, auto value) { | ||||
if (!value) { | if (!value) { | ||||
return; | return; | ||||
} | } | ||||
(*result)[key] = json::String::make(std::string{} + value.value()); | |||||
result[key] = std::string{} + value.value(); | |||||
}; | }; | ||||
prop_str("name", m_name); | prop_str("name", m_name); | ||||
prop_str("cat", m_cat); | prop_str("cat", m_cat); | ||||
@@ -126,8 +126,16 @@ public: | |||||
prop_num("dur", m_dur); | prop_num("dur", m_dur); | ||||
prop_char("ph", m_ph); | prop_char("ph", m_ph); | ||||
prop_char("bp", m_bp); | prop_char("bp", m_bp); | ||||
if (m_args) { | |||||
(*result)["args"] = m_args; | |||||
if (!m_args.empty()) { | |||||
result["args"] = m_args; | |||||
} | |||||
if (m_stack) { | |||||
nlohmann::json stack; | |||||
for (auto&& frame: m_stack->frames()) { | |||||
stack.push_back(ssprintf("%s%ld", frame.node->name().c_str(), frame.version)); | |||||
} | |||||
std::reverse(stack.begin(), stack.end()); | |||||
result["stack"] = stack; | |||||
} | } | ||||
return result; | return result; | ||||
} | } | ||||
@@ -140,11 +148,12 @@ private: | |||||
std::optional<uint64_t> m_pid; | std::optional<uint64_t> m_pid; | ||||
std::optional<uint64_t> m_id; | std::optional<uint64_t> m_id; | ||||
std::optional<uint64_t> m_idx; | std::optional<uint64_t> m_idx; | ||||
std::optional<uint64_t> m_ts; | |||||
std::optional<double> m_ts; | |||||
std::optional<uint64_t> m_dur; | std::optional<uint64_t> m_dur; | ||||
std::optional<char> m_ph; | std::optional<char> m_ph; | ||||
std::optional<char> m_bp; | std::optional<char> m_bp; | ||||
std::shared_ptr<json::Object> m_args; | |||||
nlohmann::json m_args; | |||||
std::optional<Trace> m_stack; | |||||
}; | }; | ||||
class ChromeTraceEvents { | class ChromeTraceEvents { | ||||
@@ -154,368 +163,218 @@ public: | |||||
return m_content.back(); | return m_content.back(); | ||||
} | } | ||||
std::shared_ptr<json::Value> to_json() const { | |||||
auto result = json::Object::make(); | |||||
auto event_list = json::Array::make(); | |||||
std::string metadata(std::string key) { | |||||
return m_metadata[key]; | |||||
} | |||||
nlohmann::json to_json() const { | |||||
nlohmann::json result; | |||||
nlohmann::json event_list; | |||||
nlohmann::json metadata; | |||||
for (auto&& event: m_content) { | for (auto&& event: m_content) { | ||||
event_list->add(event.to_json()); | |||||
event_list.push_back(event.to_json()); | |||||
} | |||||
for (auto&& [key, value]: m_metadata) { | |||||
metadata[key] = value; | |||||
} | } | ||||
(*result)["traceEvents"] = event_list; | |||||
result["traceEvents"] = event_list; | |||||
result["metadata"] = metadata; | |||||
return result; | return result; | ||||
} | } | ||||
std::string to_string() const { | |||||
auto json = to_json(); | |||||
return "{" "traceEvents:" + nlohmann::to_string(json["traceEvents"]) + "," | |||||
"metadata:" + nlohmann::to_string(json["metadata"]) + "}"; | |||||
} | |||||
private: | private: | ||||
std::vector<ChromeTraceEvent> m_content; | std::vector<ChromeTraceEvent> m_content; | ||||
std::unordered_map<std::string, std::string> m_metadata; | |||||
}; | }; | ||||
void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results){ | |||||
auto pid = getpid(); | |||||
ProfileDataCollector collector; | |||||
ProfileState state; | |||||
#define HANDLE_EVENT(type, ...) \ | |||||
collector.handle<type>([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ ); | |||||
struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> { | |||||
ChromeTraceEvents trace_events; | ChromeTraceEvents trace_events; | ||||
decltype(getpid()) pid = getpid(); | |||||
std::string pid_str = std::to_string(pid); | |||||
#define NEW_HOST(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(state[tid].index).ph(PH).ts((double)time/1e3) | |||||
#define NEW_DEVICE(NAME, PH) trace_events.new_event().name(NAME).pid(pid).tid(256+state[event.event->comp_node()].index).ph(PH).ts((double)get_device_time(event.event, time)/1e3) | |||||
#define OP_NAME op_state.name | |||||
#define OP_KERNEL_NAME (op_state.name + "") | |||||
#define OP_PROPS get_op_args(op_state) | |||||
#define OP_ID event.op_id | |||||
#define TENSOR_PROPS get_tensor_args(tensor_state, time) | |||||
#define TENSOR_INFO get_tensor_info(tensor_state, time) | |||||
#define TENSOR_COMMAND_KIND print_tensor_command_kind(event.kind) | |||||
#define HANDLE_PLAIN_EVENT(START, FINISH, NAME_EXPR)\ | |||||
HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\ | |||||
HANDLE_EVENT(FINISH, { NEW_HOST(NAME_EXPR, 'E'); }) | |||||
#define HANDLE_TENSOR_EVENT(START, FINISH, NAME_EXPR)\ | |||||
HANDLE_EVENT(START, { NEW_HOST(NAME_EXPR, 'B'); })\ | |||||
HANDLE_EVENT(FINISH, { auto& tensor_state = state.tensors[event.tensor_id]; NEW_HOST(NAME_EXPR, 'E').args(TENSOR_PROPS); }) | |||||
#define INC_COUNTER(NAME, DELTA)\ | |||||
{ state.statics.NAME += DELTA; NEW_HOST(#NAME, 'C').arg(#NAME, state.statics.NAME); } | |||||
auto get_tensor_args = [](const ProfileTensorState& tensor, uint64_t time) -> std::shared_ptr<json::Object> { | |||||
auto args = json::Object::make(); | |||||
(*args)["id"] = json::Number::make(tensor.id); | |||||
(*args)["name"] = json::String::make(tensor.name); | |||||
(*args)["shape"] = json::String::make(tensor.layout.TensorShape::to_string()); | |||||
(*args)["dtype"] = json::String::make(tensor.layout.dtype.name()); | |||||
(*args)["nr_elements"] = json::Number::make(tensor.layout.total_nr_elems()); | |||||
(*args)["device"] = json::String::make(tensor.device.to_string()); | |||||
if (tensor.produced) { | |||||
(*args)["living_time"] = json::String::make(std::to_string((time - tensor.produced + tensor.living_time)/1e6) + "ms"); | |||||
} | |||||
return args; | |||||
}; | |||||
auto get_tensor_info = [](const ProfileTensorState& tensor, uint64_t time) -> std::string { | |||||
std::string name = tensor.name; | |||||
std::string shape = tensor.layout.TensorShape::to_string(); | |||||
std::string size_in_bytes = std::to_string(tensor.size_in_bytes()); | |||||
std::string device = tensor.device.to_string(); | |||||
std::string dtype = tensor.layout.dtype.name(); | |||||
return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.c_str()); | |||||
}; | |||||
ChromeTraceEvent& new_event(std::string name, char ph, size_t tid, profiler::HostTime time) { | |||||
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts(since_start(time)); | |||||
} | |||||
auto get_op_args = [&](const ProfileOperatorState& op) -> std::shared_ptr<json::Object> { | |||||
auto args = json::Object::make(); | |||||
auto params = op.params; | |||||
for (auto&& [name, value]: params) { | |||||
(*args)[name] = json::String::make(value); | |||||
} | |||||
(*args)["__id__"] = json::Number::make(op.id); | |||||
(*args)["__name__"] = json::String::make(op.name); | |||||
(*args)["__device__"] = json::String::make(op.device.to_string()); | |||||
return args; | |||||
ChromeTraceEvent& new_host_event(std::string name, char ph) { | |||||
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(current->tid)).ts(since_start(current->time)); | |||||
}; | }; | ||||
auto get_device_time = [&](const std::shared_ptr<CompNode::Event>& event, uint64_t host) -> uint64_t { | |||||
event->host_wait(); | |||||
auto& device_state = state.devices[event->comp_node()]; | |||||
if (!device_state.base_event) { | |||||
device_state.base_event = event; | |||||
device_state.base_time = host; | |||||
return host; | |||||
} | |||||
uint64_t device = device_state.base_event->elapsed_time_until(*event) * 1e9 + device_state.base_time; | |||||
return std::max(device, host); | |||||
ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { | |||||
using namespace std::literals::chrono_literals; | |||||
auto time = since_start(to_device_time(current->time, device)); | |||||
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(to_tid(device)).ts(time); | |||||
}; | }; | ||||
auto print_tensor_command_kind = [&](int kind) -> const char* { | |||||
const char* to_cstr(TensorCommandKind kind) { | |||||
switch(kind) { | switch(kind) { | ||||
case TensorCommandEvent::Put: | |||||
case TensorCommandKind::Put: | |||||
return "Put"; | return "Put"; | ||||
case TensorCommandEvent::Drop: | |||||
case TensorCommandKind::Drop: | |||||
return "Drop"; | return "Drop"; | ||||
case TensorCommandEvent::Del: | |||||
case TensorCommandKind::Del: | |||||
return "Del"; | return "Del"; | ||||
case TensorCommandEvent::SwapIn: | |||||
case TensorCommandKind::SwapIn: | |||||
return "SwapIn"; | return "SwapIn"; | ||||
case TensorCommandEvent::SwapOut: | |||||
case TensorCommandKind::SwapOut: | |||||
return "SwapOut"; | return "SwapOut"; | ||||
case TensorCommandEvent::RecFree: | |||||
case TensorCommandKind::RecFree: | |||||
return "RecFree"; | return "RecFree"; | ||||
case TensorCommandEvent::ReGen: | |||||
case TensorCommandKind::ReGen: | |||||
return "ReGen"; | return "ReGen"; | ||||
case TensorCommandKind::GetValue: | |||||
return "GetValue"; | |||||
} | } | ||||
return "UnknownCommand"; | return "UnknownCommand"; | ||||
}; | |||||
HANDLE_EVENT(OpDispatchEvent, { | |||||
auto& op_state = state.operators[OP_ID] = {}; | |||||
op_state.id = OP_ID; | |||||
op_state.name = event.op_name; | |||||
op_state.params = event.op_params(); | |||||
op_state.inputs = event.inputs; | |||||
op_state.outputs = event.outputs; | |||||
NEW_HOST("OpDispatch", 'B'); | |||||
NEW_HOST(ssprintf("%d", pid), 's') | |||||
.cat("OpDispatch") | |||||
.id(OP_ID) | |||||
.scope(std::to_string(pid)); | |||||
NEW_HOST("OpDispatch", 'E').args(OP_PROPS); | |||||
INC_COUNTER(op_enqueue_count, 1); | |||||
}); | |||||
HANDLE_EVENT(OpExecuteEvent, { | |||||
mgb_assert(OP_ID != 0); | |||||
mgb_assert(state.operators.count(OP_ID) > 0); | |||||
auto& op_state = state.operators[OP_ID]; | |||||
op_state.host_begin = time; | |||||
NEW_HOST(OP_NAME, 'B'); | |||||
//.args(OP_PROPS); | |||||
NEW_HOST(ssprintf("%d", pid), 't') | |||||
.cat("OpDispatch") | |||||
.id(OP_ID) | |||||
.scope(std::to_string(pid)); | |||||
INC_COUNTER(op_execute_count, 1); | |||||
}); | |||||
HANDLE_EVENT(OpExecuteFinishEvent, { | |||||
auto& op_state = state.operators[event.op_id]; | |||||
op_state.host_end = time; | |||||
NEW_HOST(OP_NAME, 'E') | |||||
.args(OP_PROPS); | |||||
}); | |||||
HANDLE_EVENT(KernelExecuteEvent, { | |||||
auto& op_state = state.operators[event.op_id]; | |||||
op_state.device_begin = event.event; | |||||
NEW_HOST(ssprintf("%d", pid), 's') | |||||
.id(event.kernel_id) | |||||
.cat("KernelLaunch") | |||||
.scope(std::to_string(pid)); | |||||
NEW_DEVICE(OP_KERNEL_NAME, 'B') | |||||
.cat("Kernel"); | |||||
//.args(OP_PROPS); | |||||
NEW_DEVICE(ssprintf("%d", pid), 'f') | |||||
.id(event.kernel_id) | |||||
.bp('e') | |||||
.cat("KernelLaunch") | |||||
.scope(std::to_string(pid)); | |||||
}); | |||||
HANDLE_EVENT(KernelExecuteFinishEvent, { | |||||
auto& op_state = state.operators[event.op_id]; | |||||
op_state.device_end = event.event; | |||||
NEW_DEVICE(OP_KERNEL_NAME, 'E') | |||||
.cat("Kernel") | |||||
.args(OP_PROPS); | |||||
}); | |||||
HANDLE_EVENT(TensorDeclareEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id] = {}; | |||||
tensor_state.id = event.tensor_id; | |||||
tensor_state.name = event.name; | |||||
}); | |||||
HANDLE_EVENT(TensorProduceEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id]; | |||||
tensor_state.device = event.device; | |||||
tensor_state.layout = event.layout; | |||||
tensor_state.produced = time; | |||||
if (!tensor_state.living_time) { | |||||
NEW_HOST(ssprintf("%d", pid), 's') | |||||
.id(event.tensor_id) | |||||
.cat("TensorLink") | |||||
.scope(std::to_string(pid)); | |||||
} else { | |||||
NEW_HOST(ssprintf("%d", pid), 't') | |||||
.id(event.tensor_id) | |||||
.cat("TensorLink") | |||||
.scope(std::to_string(pid)); | |||||
} | |||||
INC_COUNTER(alive_tensor_count, 1); | |||||
INC_COUNTER(produce_tensor_count, 1); | |||||
state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()}); | |||||
state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced}); | |||||
}); | |||||
HANDLE_EVENT(TensorUsageEvent, { | |||||
NEW_HOST(ssprintf("%d", pid), 't') | |||||
.id(event.tensor_id) | |||||
.cat("TensorLink") | |||||
.scope(std::to_string(pid)); | |||||
}); | |||||
HANDLE_EVENT(TensorReleaseEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id]; | |||||
tensor_state.living_time += time - tensor_state.produced; | |||||
tensor_state.produced = 0; | |||||
INC_COUNTER(alive_tensor_count, -1); | |||||
INC_COUNTER(erase_tensor_count, 1); | |||||
state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()}); | |||||
state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced}); | |||||
NEW_HOST(ssprintf("%d", pid), 't') | |||||
.id(event.tensor_id) | |||||
.cat("TensorLink") | |||||
.scope(std::to_string(pid)); | |||||
}); | |||||
HANDLE_EVENT(TensorEraseEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id]; | |||||
if (tensor_state.living_time) { | |||||
NEW_HOST(ssprintf("%d", pid), 'f') | |||||
.id(event.tensor_id) | |||||
.bp('e') | |||||
.cat("TensorLink") | |||||
.scope(std::to_string(pid)); | |||||
} | |||||
if (event.use_count == 0) { | |||||
INC_COUNTER(redundant_tensor_count, 1); | |||||
} | |||||
}); | |||||
HANDLE_EVENT(TensorGetPropEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id]; | |||||
NEW_HOST("TensorGetProp", 'X') | |||||
.dur(0).args(TENSOR_PROPS); | |||||
}); | |||||
HANDLE_EVENT(TensorWaitPropEvent, { | |||||
NEW_HOST("TensorWaitProp", 'B'); | |||||
if (event.prop == TensorProp::HostValue) { | |||||
INC_COUNTER(wait_value_count, 1); | |||||
} else if (event.prop == TensorProp::Shape) { | |||||
INC_COUNTER(wait_shape_count, 1); | |||||
} | |||||
INC_COUNTER(wait_prop_count, 1); | |||||
}); | |||||
HANDLE_EVENT(TensorWaitPropFinishEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id]; | |||||
if (event.notified) { | |||||
NEW_HOST(ssprintf("%d", pid), 'f') | |||||
.id(event.tensor_id) | |||||
.bp('e') | |||||
.cat("TensorProp") | |||||
.scope(std::to_string(pid)); | |||||
} | |||||
NEW_HOST("TensorWaitProp", 'E') | |||||
.args(TENSOR_PROPS); | |||||
}); | |||||
HANDLE_EVENT(TensorNotifyPropEvent, { | |||||
NEW_HOST(ssprintf("%d", pid), 's') | |||||
.id(event.tensor_id) | |||||
.cat("TensorProp") | |||||
.scope(std::to_string(pid)); | |||||
}); | |||||
HANDLE_EVENT(ShapeInferEvent, { | |||||
if (event.success) { | |||||
INC_COUNTER(infer_shape_valid_count, 1); | |||||
} else { | |||||
INC_COUNTER(infer_shape_invalid_count, 1); | |||||
} | |||||
}); | |||||
HANDLE_EVENT(SampleDeviceEvent, { | |||||
NEW_HOST("TopKTensor", 'B'); | |||||
}); | |||||
HANDLE_EVENT(SampleDeviceFinishEvent, { | |||||
std::string device_name = event.device.locator().to_string(); | |||||
std::string prop_name = ssprintf("%s_alloc_memory", device_name.c_str()); | |||||
NEW_HOST(prop_name, 'C') | |||||
.arg(prop_name, event.total_memory - event.free_memory); | |||||
auto top_k_tensors = state.top_k_tensor_in_device(event.device, options.at("num_tensor_watch")); | |||||
auto& top_k_event = NEW_HOST("TopKTensor", 'E'); | |||||
for (size_t i = 0; i < top_k_tensors.size(); ++i) { | |||||
auto tensor_id = top_k_tensors[i]; | |||||
auto& tensor_state = state.tensors[tensor_id]; | |||||
top_k_event.arg(ssprintf("top%03d", (int)i), TENSOR_INFO); //%03d is always enough | |||||
} | |||||
}); | |||||
HANDLE_EVENT(WorkerExceptionEvent, { | |||||
INC_COUNTER(exception_count, 1); | |||||
}); | |||||
HANDLE_EVENT(TensorCommandEvent, { | |||||
NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'B'); | |||||
}); | |||||
HANDLE_EVENT(TensorCommandFinishEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id]; | |||||
NEW_HOST(ssprintf("%s %zu", TENSOR_COMMAND_KIND, event.tensor_id), 'E') | |||||
.args(TENSOR_PROPS); | |||||
}); | |||||
HANDLE_EVENT(ScopeEvent, { | |||||
NEW_HOST(event.name, 'B'); | |||||
state.threads[tid].scope_stack.push_back(event.name); | |||||
}); | |||||
HANDLE_EVENT(ScopeFinishEvent, { | |||||
NEW_HOST(event.name, 'E'); | |||||
mgb_assert(state.threads[tid].scope_stack.back() == event.name); | |||||
state.threads[tid].scope_stack.pop_back(); | |||||
}); | |||||
HANDLE_TENSOR_EVENT(OpInputEvent, OpInputFinishEvent, ssprintf("Input %zu", event.tensor_id)); | |||||
HANDLE_TENSOR_EVENT(OpOutputEvent, OpOutputFinishEvent, ssprintf("Output %zu", event.tensor_id)); | |||||
HANDLE_TENSOR_EVENT(OpDelEvent, OpDelFinishEvent, ssprintf("Del %zu", event.tensor_id)); | |||||
HANDLE_PLAIN_EVENT(StartProfileEvent, StartProfileFinishEvent, "StartProfile"); | |||||
HANDLE_PLAIN_EVENT(StopProfileEvent, StopProfileFinishEvent, "StopProfile"); | |||||
HANDLE_PLAIN_EVENT(CustomEvent, CustomFinishEvent, event.title); | |||||
HANDLE_PLAIN_EVENT(AutoEvictEvent, AutoEvictFinishEvent, "AutoEvict"); | |||||
if (results.size() > 0) { | |||||
uint64_t time = results[0].second.time; | |||||
trace_events.new_event().name("Metadata").ph('I').pid(pid).ts(0).arg("localTime", time/1e3); | |||||
} | } | ||||
for (auto&& result: results) { | |||||
collector(result.second.id, result.first, result.second.time, result.second.data); | |||||
} | |||||
for (auto&& [tid, thread]: state.threads) { | |||||
if (!thread_dict.count(tid)) { | |||||
continue; | |||||
template <typename TEvent> | |||||
void visit_event(const TEvent &event) { | |||||
if constexpr (std::is_same_v<TEvent, OpDispatchEvent>) { | |||||
new_host_event("OpDispatch", 'B'); | |||||
new_host_event(pid_str, 's') | |||||
.cat("OpDispatch") | |||||
.id(event.op_id) | |||||
.scope(pid_str); | |||||
new_host_event("OpDispatch", 'E').args(current_op->detail()); | |||||
} else if constexpr (std::is_same_v<TEvent, OpExecuteEvent>) { | |||||
mgb_assert(event.op_id != 0); | |||||
current_op->execute_begin = current->time; | |||||
new_host_event(current_op->name, 'B'); | |||||
new_host_event(pid_str, 't') | |||||
.cat("OpDispatch") | |||||
.id(current_op->id) | |||||
.scope(pid_str); | |||||
} else if constexpr (std::is_same_v<TEvent, OpExecuteFinishEvent>) { | |||||
current_op->execute_end = current->time; | |||||
new_host_event(current_op->name, 'E') | |||||
.args(current_op->detail()); | |||||
} else if constexpr (std::is_same_v<TEvent, KernelLaunchEvent>) { | |||||
new_host_event(pid_str, 's') | |||||
.id(event.kernel_id) | |||||
.cat("KernelLaunch") | |||||
.scope(pid_str); | |||||
new_device_event(current_op->name, 'B', event.device) | |||||
.cat("Kernel"); | |||||
new_device_event(pid_str, 'f', event.device) | |||||
.id(event.kernel_id) | |||||
.bp('e') | |||||
.cat("KernelLaunch") | |||||
.scope(pid_str); | |||||
} else if constexpr (std::is_same_v<TEvent, KernelLaunchFinishEvent>) { | |||||
new_device_event(current_op->name, 'E', event.device) | |||||
.cat("Kernel") | |||||
.args(current_op->detail()); | |||||
} else if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) { | |||||
if (current_tensor->living_time != profiler::Duration::zero()) { | |||||
new_host_event(pid_str, 's') | |||||
.id(event.tensor_id) | |||||
.cat("TensorLink") | |||||
.scope(pid_str); | |||||
} else { | |||||
new_host_event(pid_str, 't') | |||||
.id(event.tensor_id) | |||||
.cat("TensorLink") | |||||
.scope(pid_str); | |||||
} | |||||
} else if constexpr (std::is_same_v<TEvent, TensorUsageEvent>) { | |||||
new_host_event(pid_str, 't') | |||||
.id(event.tensor_id) | |||||
.cat("TensorLink") | |||||
.scope(pid_str); | |||||
} else if constexpr (std::is_same_v<TEvent, TensorReleaseEvent>) { | |||||
current_tensor->living_time += current->time - current_tensor->produced; | |||||
current_tensor->produced = {}; | |||||
new_host_event(pid_str, 't') | |||||
.id(event.tensor_id) | |||||
.cat("TensorLink") | |||||
.scope(pid_str); | |||||
} else if constexpr (std::is_same_v<TEvent, TensorEraseEvent>) { | |||||
if (current_tensor->living_time != profiler::Duration::zero()) { | |||||
new_host_event(pid_str, 'f') | |||||
.id(event.tensor_id) | |||||
.bp('e') | |||||
.cat("TensorLink") | |||||
.scope(pid_str); | |||||
} | |||||
} else if constexpr (std::is_same_v<TEvent, TensorGetPropEvent>) { | |||||
new_host_event("TensorGetProp", 'X') | |||||
.dur(0).args(current_tensor->detail(current->time)); | |||||
} else if constexpr (std::is_same_v<TEvent, TensorWaitPropFinishEvent>) { | |||||
new_host_event(pid_str, 'f') | |||||
.id(event.tensor_id) | |||||
.bp('e') | |||||
.cat("TensorProp") | |||||
.scope(pid_str); | |||||
new_host_event("TensorWaitProp", 'E') | |||||
.args(current_tensor->detail(current->time)); | |||||
} else if constexpr (std::is_same_v<TEvent, TensorNotifyPropEvent>) { | |||||
new_host_event(pid_str, 's') | |||||
.id(event.tensor_id) | |||||
.cat("TensorProp") | |||||
.scope(pid_str); | |||||
} else if constexpr (std::is_same_v<TEvent, SampleDeviceFinishEvent>) { | |||||
std::string device_name = event.device.locator().to_string(); | |||||
new_host_event("memory", 'C') | |||||
.arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory); | |||||
} else if constexpr (std::is_same_v<TEvent, TensorCommandEvent>) { | |||||
new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B'); | |||||
} else if constexpr (std::is_same_v<TEvent, TensorCommandFinishEvent>) { | |||||
new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'E') | |||||
.args(current_tensor->detail(current->time)); | |||||
} else if constexpr (std::is_same_v<TEvent, ScopeEvent>) { | |||||
new_host_event(event.name, 'B'); | |||||
} else if constexpr (std::is_same_v<TEvent, ScopeFinishEvent>) { | |||||
new_host_event(event.name, 'E'); | |||||
} else if constexpr (std::is_same_v<TEvent, OpInputEvent>) { | |||||
new_host_event(ssprintf("Input %zu", event.tensor_id), 'B') | |||||
.args(current_tensor->detail(current->time)); | |||||
} else if constexpr (std::is_same_v<TEvent, OpInputFinishEvent>) { | |||||
new_host_event(ssprintf("Input %zu", event.tensor_id), 'E') | |||||
.args(current_tensor->detail(current->time)); | |||||
} else if constexpr (std::is_same_v<TEvent, OpOutputEvent>) { | |||||
new_host_event(ssprintf("Output %zu", event.tensor_id), 'B') | |||||
.args(current_tensor->detail(current->time)); | |||||
} else if constexpr (std::is_same_v<TEvent, OpOutputFinishEvent>) { | |||||
new_host_event(ssprintf("Output %zu", event.tensor_id), 'E') | |||||
.args(current_tensor->detail(current->time)); | |||||
} else if constexpr (std::is_same_v<TEvent, StartProfileEvent>) { | |||||
new_host_event("StartProfile", 'B'); | |||||
} else if constexpr (std::is_same_v<TEvent, StartProfileFinishEvent>) { | |||||
new_host_event("StartProfile", 'E'); | |||||
} else if constexpr (std::is_same_v<TEvent, StopProfileEvent>) { | |||||
new_host_event("StopProfile", 'B'); | |||||
} else if constexpr (std::is_same_v<TEvent, StopProfileFinishEvent>) { | |||||
new_host_event("StopProfile", 'E'); | |||||
} else if constexpr (std::is_same_v<TEvent, CustomEvent>) { | |||||
new_host_event(event.title, 'B'); | |||||
} else if constexpr (std::is_same_v<TEvent, CustomFinishEvent>) { | |||||
new_host_event(event.title, 'E'); | |||||
} else if constexpr (std::is_same_v<TEvent, AutoEvictEvent>) { | |||||
new_host_event("AutoEvict", 'B'); | |||||
} else if constexpr (std::is_same_v<TEvent, AutoEvictFinishEvent>) { | |||||
new_host_event("AutoEvict", 'E'); | |||||
} | } | ||||
trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(thread.index).arg("name", thread_dict[tid]); | |||||
} | } | ||||
for (auto&& [device, device_state]: state.devices) { | |||||
trace_events.new_event().ts(0).name("thread_name").ph('M').pid(pid).tid(256+device_state.index).arg("name", device.to_string()); | |||||
void notify_counter(std::string key, int64_t old_val, int64_t new_val) { | |||||
new_host_event(key, 'C').arg("value", new_val); | |||||
} | } | ||||
}; | |||||
trace_events.to_json()->writeto_fpath(filename); | |||||
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result){ | |||||
ChromeTimelineEventVisitor visitor; | |||||
visitor.process_events(result); | |||||
visitor.trace_events.metadata("localTime") = std::to_string(result.start_at.time_since_epoch().count()); | |||||
std::string json_repr = visitor.trace_events.to_string(); | |||||
mgb::debug::write_to_file(filename.c_str(), json_repr); | |||||
} | } | ||||
} | } |
@@ -12,7 +12,9 @@ | |||||
#pragma once | #pragma once | ||||
#include "megbrain/utils/small_vector.h" | #include "megbrain/utils/small_vector.h" | ||||
#include "megbrain/imperative/profiler.h" | |||||
#include "../interpreter/stack_manager.h" | |||||
#include "../op_trait.h" | #include "../op_trait.h" | ||||
namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
@@ -52,6 +54,11 @@ struct ToStringTrait<profiler::TensorProp>{ | |||||
namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
using Trace = interpreter::intl::StackManager::Trace; | |||||
struct ProfileOperatorState; | |||||
struct ProfileTensorState; | |||||
#define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__; | #define DEF_EVENT(X, ...) struct X##Event __VA_ARGS__; | ||||
#define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__; | #define DEF_DUR_EVENT(X, ...) struct X##Event __VA_ARGS__; struct X##FinishEvent __VA_ARGS__; | ||||
@@ -61,6 +68,7 @@ DEF_EVENT(OpDispatch, { | |||||
std::function<OpParams()> op_params; | std::function<OpParams()> op_params; | ||||
SmallVector<uint64_t> inputs; | SmallVector<uint64_t> inputs; | ||||
SmallVector<uint64_t> outputs; | SmallVector<uint64_t> outputs; | ||||
Trace trace; | |||||
}); | }); | ||||
DEF_DUR_EVENT(OpInput, { | DEF_DUR_EVENT(OpInput, { | ||||
@@ -68,11 +76,6 @@ DEF_DUR_EVENT(OpInput, { | |||||
TensorShape shape; | TensorShape shape; | ||||
}); | }); | ||||
DEF_DUR_EVENT(OpDel, { | |||||
uint64_t tensor_id; | |||||
TensorShape shape; | |||||
}); | |||||
DEF_DUR_EVENT(OpOutput, { | DEF_DUR_EVENT(OpOutput, { | ||||
uint64_t tensor_id; | uint64_t tensor_id; | ||||
TensorShape shape; | TensorShape shape; | ||||
@@ -80,16 +83,13 @@ DEF_DUR_EVENT(OpOutput, { | |||||
DEF_DUR_EVENT(OpExecute, { | DEF_DUR_EVENT(OpExecute, { | ||||
uint64_t op_id; | uint64_t op_id; | ||||
SmallVector<CompNode> device_list; | |||||
}); | }); | ||||
DEF_DUR_EVENT(OpPostExecute, { | |||||
uint64_t op_id; | |||||
}); | |||||
DEF_DUR_EVENT(KernelExecute, { | |||||
DEF_DUR_EVENT(KernelLaunch, { | |||||
uint64_t op_id; | uint64_t op_id; | ||||
uint64_t kernel_id; | uint64_t kernel_id; | ||||
std::shared_ptr<CompNode::Event> event; | |||||
CompNode device; | |||||
}); | }); | ||||
DEF_EVENT(TensorDeclare, { | DEF_EVENT(TensorDeclare, { | ||||
@@ -128,19 +128,12 @@ DEF_EVENT(TensorNotifyProp, { | |||||
TensorProp prop; | TensorProp prop; | ||||
}); | }); | ||||
DEF_EVENT(TensorWaitProp, { | |||||
DEF_DUR_EVENT(TensorWaitProp, { | |||||
uint64_t tensor_id; | uint64_t tensor_id; | ||||
uint64_t wait_id; | uint64_t wait_id; | ||||
TensorProp prop; | TensorProp prop; | ||||
}); | }); | ||||
DEF_EVENT(TensorWaitPropFinish, { | |||||
uint64_t tensor_id; | |||||
uint64_t wait_id; | |||||
TensorProp prop; | |||||
bool notified; | |||||
}); | |||||
DEF_DUR_EVENT(SampleDevice, { | DEF_DUR_EVENT(SampleDevice, { | ||||
CompNode device; | CompNode device; | ||||
size_t total_memory; | size_t total_memory; | ||||
@@ -157,13 +150,10 @@ DEF_DUR_EVENT(Scope, { | |||||
std::string name; | std::string name; | ||||
}); | }); | ||||
DEF_DUR_EVENT(DeviceScope, { | |||||
std::string name; | |||||
std::shared_ptr<CompNode::Event> event; | |||||
DEF_DUR_EVENT(Sync, { | |||||
Trace trace; | |||||
}); | }); | ||||
DEF_DUR_EVENT(Sync, {}); | |||||
DEF_DUR_EVENT(StartProfile, { | DEF_DUR_EVENT(StartProfile, { | ||||
size_t capture_count; | size_t capture_count; | ||||
}); | }); | ||||
@@ -172,10 +162,13 @@ DEF_DUR_EVENT(StopProfile, { | |||||
size_t escape_count; | size_t escape_count; | ||||
}); | }); | ||||
enum class TensorCommandKind { | |||||
Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue | |||||
}; | |||||
DEF_DUR_EVENT(TensorCommand, { | DEF_DUR_EVENT(TensorCommand, { | ||||
enum Kind { | |||||
Put, Del, SwapIn, SwapOut, Drop, ReGen, RecFree, GetValue | |||||
}; | |||||
using Kind = TensorCommandKind; | |||||
uint64_t tensor_id; | uint64_t tensor_id; | ||||
Kind kind; | Kind kind; | ||||
}); | }); | ||||
@@ -187,6 +180,17 @@ DEF_DUR_EVENT(Custom, { | |||||
std::string content; | std::string content; | ||||
}); | }); | ||||
DEF_EVENT(RecordDevice, { | |||||
std::shared_ptr<CompNode::Event> event; | |||||
}); | |||||
DEF_DUR_EVENT(HostToDevice, { | |||||
TensorLayout layout; | |||||
CompNode device; | |||||
void* host_ptr; | |||||
void* device_ptr; | |||||
}); | |||||
#undef DEF_EVENT | #undef DEF_EVENT | ||||
#undef DEF_DUR_EVENT | #undef DEF_DUR_EVENT | ||||
@@ -15,10 +15,12 @@ | |||||
#include "megbrain/imperative/profiler.h" | #include "megbrain/imperative/profiler.h" | ||||
#include "./states.h" | |||||
namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
void dump_chrome_timeline(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results); | |||||
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result); | |||||
void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results); | |||||
void dump_memory_flow(std::string filename, Profiler::bundle_t result); | |||||
} | } |
@@ -89,7 +89,8 @@ struct MemoryChunk { | |||||
std::array<uintptr_t, 2> address; | std::array<uintptr_t, 2> address; | ||||
std::string name; | std::string name; | ||||
TensorLayout layout; | TensorLayout layout; | ||||
std::array<uint64_t, 2> time; | |||||
std::array<profiler::Duration, 2> time; | |||||
std::optional<uint64_t> group; | |||||
bool empty() const { | bool empty() const { | ||||
return address[1] - address[0] == 0; | return address[1] - address[0] == 0; | ||||
@@ -111,9 +112,9 @@ struct MemoryFlow { | |||||
return {addr_begin, addr_end}; | return {addr_begin, addr_end}; | ||||
} | } | ||||
std::pair<uint64_t, uint64_t> time_range() const { | |||||
auto time_begin = std::numeric_limits<uint64_t>::max(); | |||||
auto time_end = std::numeric_limits<uint64_t>::min(); | |||||
std::pair<profiler::Duration, profiler::Duration> time_range() const { | |||||
auto time_begin = profiler::Duration::max(); | |||||
auto time_end = profiler::Duration::min(); | |||||
for(auto&& [id, chunk]: chunks) { | for(auto&& [id, chunk]: chunks) { | ||||
MGB_MARK_USED_VAR(id); | MGB_MARK_USED_VAR(id); | ||||
if (chunk.empty()) continue; | if (chunk.empty()) continue; | ||||
@@ -123,27 +124,6 @@ struct MemoryFlow { | |||||
return {time_begin, time_end}; | return {time_begin, time_end}; | ||||
} | } | ||||
std::shared_ptr<json::Array> to_json() const { | |||||
auto results = json::Array::make(); | |||||
for(auto&& [id, chunk]: chunks) { | |||||
MGB_MARK_USED_VAR(id); | |||||
if (chunk.empty()) continue; | |||||
auto address = json::Array::make(); | |||||
auto time = json::Array::make(); | |||||
address->add(json::String::make(std::to_string(chunk.address[0]))); | |||||
address->add(json::String::make(std::to_string(chunk.address[1]))); | |||||
time->add(json::String::make(std::to_string(chunk.time[0]))); | |||||
time->add(json::String::make(std::to_string(chunk.time[1]))); | |||||
results->add(json::Object::make({ | |||||
{"address", address}, | |||||
{"name", json::String::make(chunk.name)}, | |||||
{"layout", json::String::make(chunk.layout.to_string())}, | |||||
{"time", time} | |||||
})); | |||||
} | |||||
return results; | |||||
} | |||||
XMLWriter to_svg() const { | XMLWriter to_svg() const { | ||||
XMLWriter writer; | XMLWriter writer; | ||||
auto&& [addr_begin, addr_end] = address_range(); | auto&& [addr_begin, addr_end] = address_range(); | ||||
@@ -157,13 +137,13 @@ struct MemoryFlow { | |||||
svg.attr("xmlns:tag", std::string{"https://megengine.org.cn"}); | svg.attr("xmlns:tag", std::string{"https://megengine.org.cn"}); | ||||
double time_scale = 1e5; | double time_scale = 1e5; | ||||
double addr_scale = 1e6; | double addr_scale = 1e6; | ||||
svg.attr("width", (time_end-time_begin)/time_scale); | |||||
svg.attr("width", (time_end-time_begin).count()/time_scale); | |||||
svg.attr("height", (addr_end-addr_begin)/addr_scale); | svg.attr("height", (addr_end-addr_begin)/addr_scale); | ||||
{ | { | ||||
auto rect = writer.element("rect"); | auto rect = writer.element("rect"); | ||||
rect.attr("x", 0); | rect.attr("x", 0); | ||||
rect.attr("y", 0); | rect.attr("y", 0); | ||||
rect.attr("width", (time_end-time_begin)/time_scale); | |||||
rect.attr("width", (time_end-time_begin).count()/time_scale); | |||||
rect.attr("height", (addr_end-addr_begin)/addr_scale); | rect.attr("height", (addr_end-addr_begin)/addr_scale); | ||||
rect.attr("fill", std::string{"blue"}); | rect.attr("fill", std::string{"blue"}); | ||||
} | } | ||||
@@ -177,7 +157,7 @@ struct MemoryFlow { | |||||
{1000 * ms, "#888888"}, | {1000 * ms, "#888888"}, | ||||
{std::numeric_limits<double>::infinity(), "#555555"}, | {std::numeric_limits<double>::infinity(), "#555555"}, | ||||
}; | }; | ||||
auto time2str = [](uint64_t ns){ | |||||
auto time2str = [](profiler::Duration ns){ | |||||
using pair_t = std::pair<uint64_t, const char*>; | using pair_t = std::pair<uint64_t, const char*>; | ||||
static pair_t units[] = { | static pair_t units[] = { | ||||
{1, "ns "}, | {1, "ns "}, | ||||
@@ -189,9 +169,9 @@ struct MemoryFlow { | |||||
auto comparator = [](const pair_t& lhs, const pair_t& rhs) { | auto comparator = [](const pair_t& lhs, const pair_t& rhs) { | ||||
return lhs.first < rhs.first; | return lhs.first < rhs.first; | ||||
}; | }; | ||||
while (ns > 0) { | |||||
auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns, ""), comparator) - 1; | |||||
builder += std::to_string(ns / iter->first) + iter->second; | |||||
while (ns.count() > 0) { | |||||
auto iter = std::upper_bound(std::begin(units), std::end(units), std::make_pair(ns.count(), ""), comparator) - 1; | |||||
builder += std::to_string(ns.count() / iter->first) + iter->second; | |||||
ns = ns % iter->first; | ns = ns % iter->first; | ||||
} | } | ||||
return builder; | return builder; | ||||
@@ -218,11 +198,11 @@ struct MemoryFlow { | |||||
for (auto&& [id, chunk]: chunks) { | for (auto&& [id, chunk]: chunks) { | ||||
MGB_MARK_USED_VAR(id); | MGB_MARK_USED_VAR(id); | ||||
if (chunk.empty()) continue; | if (chunk.empty()) continue; | ||||
double left = (chunk.time[0]-time_begin)/time_scale; | |||||
double right = (chunk.time[1]-time_begin)/time_scale; | |||||
double left = (chunk.time[0]-time_begin).count()/time_scale; | |||||
double right = (chunk.time[1]-time_begin).count()/time_scale; | |||||
double top = (chunk.address[0]-addr_begin)/addr_scale; | double top = (chunk.address[0]-addr_begin)/addr_scale; | ||||
double bottom = (chunk.address[1]-addr_begin)/addr_scale; | double bottom = (chunk.address[1]-addr_begin)/addr_scale; | ||||
double duration = chunk.time[1] - chunk.time[0]; | |||||
double duration = (chunk.time[1] - chunk.time[0]).count(); | |||||
{ | { | ||||
auto rect = writer.element("rect"); | auto rect = writer.element("rect"); | ||||
rect.attr("x", left); | rect.attr("x", left); | ||||
@@ -241,70 +221,48 @@ struct MemoryFlow { | |||||
mge_attr("produced", time2str(chunk.time[0])); | mge_attr("produced", time2str(chunk.time[0])); | ||||
mge_attr("erased", time2str(chunk.time[1])); | mge_attr("erased", time2str(chunk.time[1])); | ||||
mge_attr("duration", time2str(chunk.time[1] - chunk.time[0])); | mge_attr("duration", time2str(chunk.time[1] - chunk.time[0])); | ||||
if (chunk.group) { | |||||
mge_attr("group", std::to_string(*chunk.group)); | |||||
} | |||||
} | } | ||||
} | } | ||||
return writer; | return writer; | ||||
} | } | ||||
}; | }; | ||||
void dump_memory_flow(std::string filename, Profiler::options_t options, Profiler::thread_dict_t thread_dict, Profiler::results_t results) { | |||||
MemoryFlow flow; | |||||
ProfileDataCollector collector; | |||||
ProfileState state; | |||||
#define HANDLE_EVENT(type, ...) \ | |||||
collector.handle<type>([&](uint64_t id, std::thread::id tid, uint64_t time, type event) __VA_ARGS__ ); | |||||
HANDLE_EVENT(TensorDeclareEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id] = {}; | |||||
tensor_state.id = event.tensor_id; | |||||
tensor_state.name = event.name; | |||||
}); | |||||
struct MemoryFlowVisitor: EventVisitor<MemoryFlowVisitor> { | |||||
MemoryFlow memory_flow; | |||||
HANDLE_EVENT(TensorProduceEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id]; | |||||
tensor_state.device = event.device; | |||||
tensor_state.layout = event.layout; | |||||
tensor_state.produced = time; | |||||
state.tensors_by_size.insert({tensor_state.id, tensor_state.size_in_bytes()}); | |||||
state.tensors_by_produced.insert({tensor_state.id, tensor_state.produced}); | |||||
auto& chunk = flow.chunks[event.tensor_id]; | |||||
uintptr_t address = reinterpret_cast<uintptr_t>(event.ptr); | |||||
auto span = event.layout.span(); | |||||
auto dtype = event.layout.dtype; | |||||
// assume dtype is not lowbit | |||||
if (!address) { | |||||
chunk.address = {0, 0}; | |||||
} else { | |||||
chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()}; | |||||
template <typename TEvent> | |||||
void visit_event(const TEvent &event) { | |||||
if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) { | |||||
auto& chunk = memory_flow.chunks[event.tensor_id]; | |||||
uint64_t address = reinterpret_cast<uintptr_t>(event.ptr); | |||||
auto span = event.layout.span(); | |||||
auto dtype = event.layout.dtype; | |||||
// assume dtype is not lowbit | |||||
if (!address) { | |||||
chunk.address = {0, 0}; | |||||
} else { | |||||
chunk.address = {address+span.low_elem*dtype.size(), address+span.high_elem*dtype.size()}; | |||||
} | |||||
chunk.layout = event.layout; | |||||
chunk.time[0] = since_start(to_device_time(current->time, current_tensor->device)); | |||||
chunk.name = current_tensor->name; | |||||
chunk.group = current_tensor->source; | |||||
} else if constexpr (std::is_same_v<TEvent, TensorReleaseEvent>) { | |||||
auto& chunk = memory_flow.chunks[event.tensor_id]; | |||||
chunk.time[1] = since_start(to_device_time(current->time, current_tensor->device)); | |||||
} | } | ||||
chunk.layout = tensor_state.layout; | |||||
chunk.time[0] = time; | |||||
chunk.name = tensor_state.name; | |||||
}); | |||||
HANDLE_EVENT(TensorReleaseEvent, { | |||||
auto& tensor_state = state.tensors[event.tensor_id]; | |||||
state.tensors_by_size.erase({tensor_state.id, tensor_state.size_in_bytes()}); | |||||
state.tensors_by_produced.erase({tensor_state.id, tensor_state.produced}); | |||||
auto& chunk = flow.chunks[event.tensor_id]; | |||||
chunk.time[1] = time; | |||||
}); | |||||
HANDLE_EVENT(ScopeEvent, { | |||||
state.threads[tid].scope_stack.push_back(event.name); | |||||
}); | |||||
HANDLE_EVENT(ScopeFinishEvent, { | |||||
mgb_assert(state.threads[tid].scope_stack.back() == event.name); | |||||
state.threads[tid].scope_stack.pop_back(); | |||||
}); | |||||
for (auto&& result: results) { | |||||
collector(result.second.id, result.first, result.second.time, result.second.data); | |||||
} | } | ||||
debug::write_to_file(filename.c_str(), flow.to_svg().to_string()); | |||||
void notify_counter(std::string key, int64_t old_val, int64_t new_val) {} | |||||
}; | |||||
void dump_memory_flow(std::string filename, Profiler::bundle_t result) { | |||||
MemoryFlowVisitor visitor; | |||||
visitor.process_events(std::move(result)); | |||||
debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string()); | |||||
} | } | ||||
} | } |
@@ -3,6 +3,9 @@ | |||||
#include <set> | #include <set> | ||||
#include <any> | #include <any> | ||||
#include <typeindex> | #include <typeindex> | ||||
#include <sstream> | |||||
#include "nlohmann/json.hpp" | |||||
#include "megbrain/tensor.h" | #include "megbrain/tensor.h" | ||||
@@ -10,24 +13,16 @@ | |||||
namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
struct ProfileDeviceState { | |||||
int64_t index; | |||||
CompNode device; | |||||
std::shared_ptr<CompNode::Event> base_event; | |||||
uint64_t base_time; //in ns | |||||
}; | |||||
struct ProfileWorkerState { | |||||
}; | |||||
using StackManager = interpreter::intl::StackManager; | |||||
struct ProfileTensorState { | struct ProfileTensorState { | ||||
uint64_t id; | |||||
uint64_t id = 0; | |||||
std::optional<uint64_t> source; | |||||
TensorLayout layout; | TensorLayout layout; | ||||
CompNode device; | CompNode device; | ||||
std::string name; | std::string name; | ||||
uint64_t produced = 0; | |||||
uint64_t living_time = 0; | |||||
profiler::HostTime produced = profiler::HostTime::min(); | |||||
profiler::Duration living_time = profiler::Duration::zero(); | |||||
size_t size_in_bytes() const { | size_t size_in_bytes() const { | ||||
if (!layout.dtype.valid()) { | if (!layout.dtype.valid()) { | ||||
@@ -35,41 +30,51 @@ struct ProfileTensorState { | |||||
} | } | ||||
return layout.dtype.size(layout.total_nr_elems()); | return layout.dtype.size(layout.total_nr_elems()); | ||||
} | } | ||||
}; | |||||
struct ProfileStaticsState { | |||||
size_t op_enqueue_count = 0; | |||||
size_t op_execute_count = 0; | |||||
size_t wait_value_count = 0; | |||||
size_t wait_shape_count = 0; | |||||
size_t exception_count = 0; | |||||
size_t infer_shape_valid_count = 0; | |||||
size_t infer_shape_invalid_count = 0; | |||||
size_t alive_tensor_count = 0; | |||||
size_t produce_tensor_count = 0; | |||||
size_t erase_tensor_count = 0; | |||||
size_t wait_prop_count = 0; | |||||
size_t redundant_tensor_count = 0; | |||||
std::string info(HostTime current_time) { | |||||
std::string shape = layout.TensorShape::to_string(); | |||||
std::string dtype = layout.dtype.name(); | |||||
return ssprintf("%s(%s:%s:%s)", name.c_str(), shape.c_str(), dtype.c_str(), device.to_string().c_str()); | |||||
} | |||||
nlohmann::json detail(HostTime current_time) { | |||||
nlohmann::json args; | |||||
args["id"] = id; | |||||
args["name"] = name; | |||||
args["shape"] = layout.TensorShape::to_string(); | |||||
args["dtype"] = layout.dtype.name(); | |||||
args["nr_elements"] = layout.total_nr_elems(); | |||||
args["device"] = device.to_string(); | |||||
if (produced != produced.min()) { | |||||
double ms_count = std::chrono::duration_cast<std::chrono::duration<double, std::micro>>(current_time - produced + living_time).count(); | |||||
args["living_time"] = ssprintf("%lf ms", ms_count); | |||||
} | |||||
return args; | |||||
} | |||||
}; | }; | ||||
struct ProfileOperatorState { | struct ProfileOperatorState { | ||||
uint64_t id; | |||||
uint64_t id = 0; | |||||
std::string name; | std::string name; | ||||
OpParams params; | OpParams params; | ||||
SmallVector<uint64_t> inputs; | SmallVector<uint64_t> inputs; | ||||
SmallVector<uint64_t> outputs; | SmallVector<uint64_t> outputs; | ||||
CompNode device; | CompNode device; | ||||
Trace trace; | |||||
uint64_t host_begin; | |||||
uint64_t host_end; | |||||
std::shared_ptr<CompNode::Event> device_begin; | |||||
std::shared_ptr<CompNode::Event> device_end; | |||||
}; | |||||
profiler::HostTime execute_begin; | |||||
profiler::HostTime execute_end; | |||||
struct ProfileThreadState { | |||||
std::thread::id tid; | |||||
int64_t index; | |||||
std::vector<std::string> scope_stack; | |||||
nlohmann::json detail() { | |||||
nlohmann::json args; | |||||
for (auto&& [name, value]: params) { | |||||
args[name] = value; | |||||
} | |||||
args["__id__"] = id; | |||||
args["__name__"] = name; | |||||
args["__device__"] = device.to_string(); | |||||
return args; | |||||
} | |||||
}; | }; | ||||
template <typename TProp> | template <typename TProp> | ||||
@@ -93,37 +98,12 @@ struct ProfileTensorPropPair { | |||||
using ProfileTensorSizePair = ProfileTensorPropPair<size_t>; | using ProfileTensorSizePair = ProfileTensorPropPair<size_t>; | ||||
using ProfileTensorProducedPair = ProfileTensorPropPair<uint64_t>; | using ProfileTensorProducedPair = ProfileTensorPropPair<uint64_t>; | ||||
struct GeneralTensorEvent { | |||||
uint64_t tensor_id; | |||||
std::type_index type; | |||||
}; | |||||
struct ProfileState { | struct ProfileState { | ||||
std::unordered_map<uint64_t, ProfileTensorState> tensors; | std::unordered_map<uint64_t, ProfileTensorState> tensors; | ||||
std::unordered_map<uint64_t, ProfileOperatorState> operators; | std::unordered_map<uint64_t, ProfileOperatorState> operators; | ||||
std::unordered_map<std::string, uint64_t> tensor_name_counter; | std::unordered_map<std::string, uint64_t> tensor_name_counter; | ||||
std::set<ProfileTensorSizePair> tensors_by_size; | std::set<ProfileTensorSizePair> tensors_by_size; | ||||
std::set<ProfileTensorSizePair> tensors_by_produced; | std::set<ProfileTensorSizePair> tensors_by_produced; | ||||
ProfileWorkerState worker; | |||||
ProfileStaticsState statics; | |||||
std::unordered_map<std::thread::id, ProfileThreadState> threads; | |||||
CompNode::UnorderedMap<ProfileDeviceState> devices; | |||||
ProfileThreadState& operator[](std::thread::id tid) { | |||||
if (threads.count(tid) == 0) { | |||||
threads[tid].tid = tid; | |||||
threads[tid].index = threads.size(); | |||||
} | |||||
return threads[tid]; | |||||
} | |||||
ProfileDeviceState& operator[](CompNode device) { | |||||
if (devices.count(device) == 0) { | |||||
devices[device].device = device; | |||||
devices[device].index = devices.size(); | |||||
} | |||||
return devices[device]; | |||||
} | |||||
std::vector<uint64_t> top_k_tensor_in_device(CompNode device, size_t k) { | std::vector<uint64_t> top_k_tensor_in_device(CompNode device, size_t k) { | ||||
std::vector<uint64_t> results; | std::vector<uint64_t> results; | ||||
@@ -138,19 +118,233 @@ struct ProfileState { | |||||
} | } | ||||
return results; | return results; | ||||
} | } | ||||
}; | |||||
std::string concat_scope(std::thread::id tid) { | |||||
auto& scope_stack = threads[tid].scope_stack; | |||||
if (scope_stack.empty()) { | |||||
return {}; | |||||
} | |||||
std::string result = scope_stack[0]; | |||||
for (size_t i = 1; i < scope_stack.size(); ++i) { | |||||
result += "::"; | |||||
result += scope_stack[i]; | |||||
template<typename T, typename = void> | |||||
struct is_op_event : std::false_type { }; | |||||
template<typename T> | |||||
struct is_op_event<T, decltype(std::declval<T>().op_id, void())> : std::true_type { }; | |||||
template<typename T, typename = void> | |||||
struct is_tensor_event : std::false_type { }; | |||||
template<typename T> | |||||
struct is_tensor_event<T, decltype(std::declval<T>().tensor_id, void())> : std::true_type { }; | |||||
template<typename T, typename = void> | |||||
struct is_trace_event : std::false_type { }; | |||||
template<typename T> | |||||
struct is_trace_event<T, decltype(std::declval<T>().trace, void())> : std::true_type { }; | |||||
template <typename... TItems> | |||||
class AnyToVariantConverter { | |||||
public: | |||||
using any_t = std::any; | |||||
using variant_t = std::variant<TItems...>; | |||||
private: | |||||
std::unordered_map<std::type_index, std::function<variant_t(any_t)>> m_table; | |||||
template <typename TItem> | |||||
void register_converter() { | |||||
m_table[typeid(TItem)] = [](any_t input) { | |||||
return variant_t(std::any_cast<TItem>(std::move(input))); | |||||
}; | |||||
} | |||||
public: | |||||
AnyToVariantConverter() { | |||||
(register_converter<TItems>(), ...); | |||||
} | |||||
variant_t operator()(any_t input) { | |||||
return m_table[input.type()](std::move(input)); | |||||
} | |||||
}; | |||||
// EventVisitor: CRTP base that replays a collected profiling bundle.
// The derived class TSelf must provide:
//   - visit_event(event):  invoked once per decoded event during the final
//     replay pass of process_events();
//   - notify_counter(key, old_value, new_value): invoked by inc_counter()
//     just before a named counter is updated.
template <typename TSelf> | |||||
class EventVisitor { | |||||
private: | |||||
// per-operator / per-tensor state rebuilt from the event stream, keyed by id
std::unordered_map<size_t, ProfileOperatorState> m_operators; | |||||
std::unordered_map<size_t, ProfileTensorState> m_tensors; | |||||
std::unordered_map<size_t, std::vector<Profiler::Record>> m_duration_stack; | |||||
// host timestamp of profile start (bundle.start_at); basis for since_start()
HostTime m_start_time; | |||||
// dense track ids: devices and host threads share one numbering space
CompNode::UnorderedMap<size_t> m_device_tid_table; | |||||
std::unordered_map<std::thread::id, size_t> m_host_tid_table; | |||||
// per-device map: host sample time -> (device - host) clock offset, built
// from RecordDeviceEvent samples; consumed by to_device_time()
CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> m_device_timeline; | |||||
std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack; | |||||
// named monotonic counters, lazily zero-initialized by inc_counter()
std::unordered_map<std::string, int64_t> m_counter_table; | |||||
protected: | |||||
// Cursor state for the derived visitor: valid only while process_events()
// is dispatching an entry (set before each visit, reset to nullptr after).
// current_op / current_tensor are non-null only for op/tensor events.
Profiler::Record* current; | |||||
ProfileOperatorState* current_op; | |||||
ProfileTensorState* current_tensor; | |||||
protected: | |||||
// Elapsed host time of `time` relative to the start of profiling.
profiler::Duration since_start(profiler::HostTime time) { | |||||
return time - m_start_time; | |||||
} | |||||
// Map a host timestamp onto the given device's clock by applying the
// recorded (device - host) offset:
//   - no samples for this device: return `time` unchanged;
//   - `time` past the last sample, exactly on a sample, or before the
//     first sample: apply the nearest sample's offset as-is;
//   - otherwise: linearly interpolate the offset between the two
//     bracketing samples.
profiler::HostTime to_device_time(profiler::HostTime time, CompNode device) { | |||||
auto& device_timeline = m_device_timeline[device]; | |||||
// first sample with host time >= `time`
auto upper = device_timeline.lower_bound(time); | |||||
if (upper == device_timeline.end()) { | |||||
if (upper == device_timeline.begin()) { | |||||
// timeline empty: no calibration data, leave the host time untouched
return time; | |||||
} else { | |||||
// past the last sample: extrapolate with the last known offset
--upper; | |||||
return time + std::chrono::duration_cast<profiler::Duration>(upper->second); | |||||
} | |||||
} else if (upper->first == time) { | |||||
// exact sample hit
return time + std::chrono::duration_cast<profiler::Duration>(upper->second); | |||||
} else if (upper == device_timeline.begin()) { | |||||
// before the first sample: use the first known offset
return time + std::chrono::duration_cast<profiler::Duration>(upper->second); | |||||
} | } | ||||
// strictly between two samples: interpolate
auto lower = upper; | |||||
-- lower; | |||||
// lower->first < time < upper->first holds here, so ratio is in (0, 1)
double ratio = ((double)(time - lower->first).count() / (double)(upper->first - lower->first).count()); | |||||
mgb_assert(ratio > 0 && ratio < 1, "invalid ratio"); | |||||
// adjusted timestamps must stay monotonic across consecutive samples
mgb_assert(lower->first + lower->second <= upper->first + upper->second, "device time corr"); | |||||
auto shift = lower->second + ratio * (upper->second - lower->second); | |||||
auto result = time + std::chrono::duration_cast<profiler::Duration>(shift); | |||||
return result; | return result; | ||||
} | } | ||||
// Dense track id for a host thread; throws if the thread was never
// registered (registration happens in process_events pass 2).
size_t to_tid(std::thread::id host_tid) { | |||||
return m_host_tid_table.at(host_tid); | |||||
} | |||||
// Dense track id for a device; throws if no TensorProduceEvent ever
// registered this device (see process_events pass 3).
size_t to_tid(CompNode device) { | |||||
return m_device_tid_table.at(device); | |||||
} | |||||
// Add `delta` to the named counter, notifying the derived class with the
// old and prospective new value BEFORE the update is applied.
void inc_counter(const char* key, int64_t delta) { | |||||
if (!m_counter_table.count(key)) { | |||||
m_counter_table[key] = 0; | |||||
} | |||||
auto& value = m_counter_table[key]; | |||||
static_cast<TSelf&>(*this).notify_counter(key, value, value + delta); | |||||
value += delta; | |||||
} | |||||
public: | |||||
// Replay `bundle` in four passes over the (id-ordered) entries:
//   1. build per-device clock-offset timelines from RecordDeviceEvent;
//   2. assign dense track ids to all host threads seen;
//   3. populate operator/tensor tables from declare/dispatch/produce events;
//   4. replay everything: update op timings and counters, then hand each
//      event to the derived class via visit_event().
void process_events(Profiler::bundle_t bundle) { | |||||
m_start_time = bundle.start_at; | |||||
auto& self = static_cast<TSelf&>(*this); | |||||
// converter turns the type-erased entry payload into a std::variant so a
// generic lambda can be dispatched with std::visit
AnyToVariantConverter<OpDispatchEvent, OpExecuteEvent, OpExecuteFinishEvent, | |||||
KernelLaunchEvent, KernelLaunchFinishEvent, | |||||
OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent, | |||||
TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent, | |||||
TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent, | |||||
SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, | |||||
StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent, | |||||
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent, | |||||
CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent> converter; | |||||
// run `handler` over every entry, keeping `current` pointed at the
// entry being dispatched so handlers can read its time/tid
auto for_each_entry = [&](auto&& handler) { | |||||
for (auto& entry: bundle.entries) { | |||||
current = &entry; | |||||
std::visit(handler, converter(entry.data)); | |||||
} | |||||
current = nullptr; | |||||
}; | |||||
// build device timeline | |||||
// host timestamp + device event of the FIRST sample seen per device;
// later samples are measured relative to it
struct DeviceStartPair { | |||||
profiler::HostTime host; | |||||
std::shared_ptr<CompNode::Event> device; | |||||
}; | |||||
CompNode::UnorderedMap<DeviceStartPair> device_start_table; | |||||
for_each_entry([&](auto&& event){ | |||||
using T = std::decay_t<decltype(event)>; | |||||
if constexpr (std::is_same_v<T, RecordDeviceEvent>) { | |||||
using namespace std::chrono_literals; | |||||
DeviceStartPair& device_start = device_start_table[event.event->comp_node()]; | |||||
if (!device_start.device) { | |||||
device_start = { current->time, event.event }; | |||||
} | |||||
// blocks until the device event completed, so elapsed_time_until is valid
event.event->host_wait(); | |||||
// offset = device elapsed - host elapsed since the device's first
// sample; stored per host sample time for interpolation later
auto device_time = (device_start.host - current->time) + std::chrono::duration_cast<profiler::RealDuration>(device_start.device->elapsed_time_until(*event.event) * 1s); | |||||
m_device_timeline[event.event->comp_node()][current->time] = device_time; | |||||
} | |||||
}); | |||||
// register host threads | |||||
// tids are dense indices shared between devices and host threads
for_each_entry([&](auto&& event){ | |||||
if (!m_host_tid_table.count(current->tid)) { | |||||
m_host_tid_table[current->tid] = {m_device_tid_table.size() + m_host_tid_table.size()}; | |||||
} | |||||
}); | |||||
// pass 3: populate operator/tensor tables before the replay pass
for_each_entry([&](auto&& event){ | |||||
using T = std::decay_t<decltype(event)>; | |||||
if constexpr (std::is_same_v<T, OpDispatchEvent>) { | |||||
auto& op = m_operators[event.op_id]; | |||||
// relies on value-initialization of the fresh map entry (id == 0)
mgb_assert(op.id == 0, "duplicate operator id"); | |||||
op.id = event.op_id; | |||||
op.name = event.op_name; | |||||
op.params = event.op_params(); | |||||
op.inputs = event.inputs; | |||||
op.outputs = event.outputs; | |||||
op.trace = event.trace; | |||||
// outputs must already have a TensorDeclareEvent earlier in the
// stream; .at() throws otherwise
for (auto&& output: event.outputs) { | |||||
m_tensors.at(output).source = op.id; | |||||
} | |||||
} else if constexpr (std::is_same_v<T, TensorDeclareEvent>) { | |||||
auto& tensor = m_tensors[event.tensor_id]; | |||||
mgb_assert(tensor.id == 0, "duplicated tensor id"); | |||||
tensor.id = event.tensor_id; | |||||
tensor.name = event.name; | |||||
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | |||||
auto& tensor = m_tensors.at(event.tensor_id); | |||||
// first sighting of this device: give it a dense track id too
if (!m_device_tid_table.count(event.device)) { | |||||
m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()}; | |||||
} | |||||
tensor.device = event.device; | |||||
} | |||||
}); | |||||
// replay execution | |||||
// NOTE(review): std::placeholders appears unused in this block — confirm
using namespace std::placeholders; | |||||
for_each_entry([&](auto&& event){ | |||||
using T = std::decay_t<decltype(event)>; | |||||
// update current_op/tensor | |||||
if constexpr (is_op_event<T>::value) { | |||||
current_op = &m_operators.at(event.op_id); | |||||
} else if constexpr (is_tensor_event<T>::value) { | |||||
current_tensor = &m_tensors.at(event.tensor_id); | |||||
} | |||||
// record operator wall-clock execution window
if constexpr (std::is_same_v<T, OpExecuteEvent>) { | |||||
current_op->execute_begin = current->time; | |||||
} else if constexpr (std::is_same_v<T, OpExecuteFinishEvent>) { | |||||
current_op->execute_end = current->time; | |||||
} | |||||
// update counters | |||||
if constexpr (std::is_same_v<T, OpDispatchEvent>) { | |||||
inc_counter("nr_op_pending", 1); | |||||
} else if constexpr (std::is_same_v<T, OpExecuteEvent>) { | |||||
inc_counter("nr_op_pending", -1); | |||||
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | |||||
inc_counter("nr_alive_tensor", 1); | |||||
} else if constexpr (std::is_same_v<T, TensorReleaseEvent>) { | |||||
inc_counter("nr_alive_tensor", -1); | |||||
} else if constexpr (std::is_same_v<T, TensorEraseEvent>) { | |||||
// tensor erased without ever being read
if (event.use_count == 0) { | |||||
// NOTE(review): key spelling "redunant" ("redundant") is a runtime
// identifier emitted in profiles; kept as-is to preserve output
inc_counter("nr_redunant_tensor", 1); | |||||
} | |||||
} else if constexpr (std::is_same_v<T, ShapeInferEvent>) { | |||||
if (!event.success) { | |||||
inc_counter("nr_shape_infer_failure", 1); | |||||
} | |||||
} else if constexpr (std::is_same_v<T, WorkerExceptionEvent>) { | |||||
inc_counter("nr_exception", 1); | |||||
} | |||||
// visit_event_impl | |||||
self.visit_event(event); | |||||
// reset current_op/tensor | |||||
if constexpr (is_op_event<T>::value) { | |||||
current_op = nullptr; | |||||
} else if constexpr (is_tensor_event<T>::value) { | |||||
current_tensor = nullptr; | |||||
} | |||||
}); | |||||
} | |||||
}; | }; | ||||
} | } |
@@ -25,6 +25,9 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { | |||||
auto on_seq_start = [this](CompSeqExecBeforeStart const& event) { | auto on_seq_start = [this](CompSeqExecBeforeStart const& event) { | ||||
// reset | // reset | ||||
mgb_assert(!event.graph->options().imperative_proxy_graph); | mgb_assert(!event.graph->options().imperative_proxy_graph); | ||||
CompNode::foreach([](CompNode device){ | |||||
Profiler::record<RecordDeviceEvent>(Timer::record_device(device)); | |||||
}); | |||||
if (m_opr_dict.empty() && m_var_dict.empty()) { | if (m_opr_dict.empty() && m_var_dict.empty()) { | ||||
init_seq(event.exec); | init_seq(event.exec); | ||||
} | } | ||||
@@ -122,11 +125,13 @@ ProfilerPlugin::ProfilerPlugin(cg::ComputingGraph* graph): PluginBase(graph) { | |||||
}; | }; | ||||
auto on_before_kern = [this](BeforeKernel const& event) { | auto on_before_kern = [this](BeforeKernel const& event) { | ||||
OperatorNodeBase* opr = event.opr; | OperatorNodeBase* opr = event.opr; | ||||
Profiler::record<KernelExecuteEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); | |||||
Profiler::record<KernelLaunchEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node); | |||||
Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node)); | |||||
}; | }; | ||||
auto on_after_kern = [this](AfterKernel const& event) { | auto on_after_kern = [this](AfterKernel const& event) { | ||||
OperatorNodeBase* opr = event.opr; | OperatorNodeBase* opr = event.opr; | ||||
Profiler::record<KernelExecuteFinishEvent>(get_opr_info(opr).id, get_opr_info(opr).id, Timer::record_event(event.comp_node)); | |||||
Profiler::record<RecordDeviceEvent>(Timer::record_device(event.comp_node)); | |||||
Profiler::record<KernelLaunchFinishEvent>(get_opr_info(opr).id, get_opr_info(opr).id, event.comp_node); | |||||
}; | }; | ||||
auto on_graph_compile = [this](const CompSeqOrderDetermined&) { | auto on_graph_compile = [this](const CompSeqOrderDetermined&) { | ||||
m_opr_dict.clear(); | m_opr_dict.clear(); | ||||
@@ -32,15 +32,22 @@ | |||||
namespace mgb { | namespace mgb { | ||||
namespace imperative { | namespace imperative { | ||||
namespace profiler { | |||||
using HostTime = std::chrono::time_point<std::chrono::high_resolution_clock>; | |||||
using Duration = std::chrono::nanoseconds; | |||||
using RealDuration = std::chrono::duration<double, std::nano>; | |||||
using Time = HostTime; | |||||
} // namespace profiler | |||||
class Timer { | class Timer { | ||||
public: | public: | ||||
void reset(); | |||||
uint64_t get_nsecs(); | |||||
uint64_t get_started_at(); | |||||
static std::shared_ptr<CompNode::Event> record_event(CompNode device); | |||||
private: | |||||
decltype(std::chrono::steady_clock::now()) m_start; | |||||
uint64_t m_started_at; | |||||
using Time = profiler::Time; | |||||
static profiler::Time record_host(); | |||||
static std::shared_ptr<CompNode::Event> record_device(CompNode device); | |||||
}; | }; | ||||
@@ -48,7 +55,8 @@ class Profiler { | |||||
public: | public: | ||||
struct Record { | struct Record { | ||||
uint64_t id; | uint64_t id; | ||||
uint64_t time; //in ns | |||||
std::thread::id tid; | |||||
profiler::Time time; | |||||
std::any data; | std::any data; | ||||
}; | }; | ||||
enum Status: uint8_t { | enum Status: uint8_t { | ||||
@@ -56,23 +64,32 @@ public: | |||||
Recording = 1, | Recording = 1, | ||||
Collecting = 2, | Collecting = 2, | ||||
}; | }; | ||||
using ProfileCollector = std::function<void(std::thread::id, Record)>; | |||||
struct ResultBundle; | |||||
using ProfileCollector = std::function<void(Record)>; | |||||
using option_t = uint64_t; | using option_t = uint64_t; | ||||
using options_t = std::unordered_map<std::string, option_t>; | using options_t = std::unordered_map<std::string, option_t>; | ||||
using result_t = std::pair<std::thread::id, Record>; | |||||
using results_t = std::vector<result_t>; | |||||
using entry_t = Record; | |||||
using bundle_t = ResultBundle; | |||||
using thread_dict_t = std::unordered_map<std::thread::id, std::string>; | using thread_dict_t = std::unordered_map<std::thread::id, std::string>; | ||||
struct ResultBundle { | |||||
profiler::HostTime start_at; | |||||
thread_dict_t thread_dict; | |||||
options_t options; | |||||
std::vector<entry_t> entries; | |||||
}; | |||||
private: | private: | ||||
std::thread::id m_thread_id; | std::thread::id m_thread_id; | ||||
std::vector<Record> m_records; | std::vector<Record> m_records; | ||||
std::vector<std::any> m_duration_stack; | |||||
std::atomic<Status> m_status = Running; | std::atomic<Status> m_status = Running; | ||||
uint64_t m_last_time = 0; | |||||
std::string m_thread_name; | std::string m_thread_name; | ||||
static options_t sm_profile_options; | static options_t sm_profile_options; | ||||
static std::mutex sm_mutex; | static std::mutex sm_mutex; | ||||
static std::unordered_map<std::thread::id, Profiler*> sm_profilers; | static std::unordered_map<std::thread::id, Profiler*> sm_profilers; | ||||
static Timer sm_timer; | static Timer sm_timer; | ||||
static profiler::HostTime sm_start_at; | |||||
static std::atomic_uint64_t sm_last_id; | static std::atomic_uint64_t sm_last_id; | ||||
static std::atomic_size_t sm_preferred_capacity; | static std::atomic_size_t sm_preferred_capacity; | ||||
static bool sm_profiling; | static bool sm_profiling; | ||||
@@ -100,7 +117,7 @@ public: | |||||
static void reset() { | static void reset() { | ||||
mgb_assert(sm_profilers.size() == 0, "profiler already running"); | mgb_assert(sm_profilers.size() == 0, "profiler already running"); | ||||
sm_timer.reset(); | |||||
sm_start_at = profiler::HostTime::min(); | |||||
} | } | ||||
static uint64_t next_id() { | static uint64_t next_id() { | ||||
@@ -110,16 +127,13 @@ public: | |||||
template <typename T, typename... TArgs> | template <typename T, typename... TArgs> | ||||
static uint64_t record(TArgs&&... args) { | static uint64_t record(TArgs&&... args) { | ||||
auto& profiler = get_instance(); | auto& profiler = get_instance(); | ||||
auto last_time = profiler.m_last_time; | |||||
if constexpr (sm_debug) { | if constexpr (sm_debug) { | ||||
Status expected = Running; | Status expected = Running; | ||||
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); | mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); | ||||
} | } | ||||
uint64_t id = next_id(); | uint64_t id = next_id(); | ||||
uint64_t time = sm_timer.get_nsecs(); | |||||
time = std::max(time, last_time + 2000); | |||||
profiler.m_last_time = time; | |||||
profiler.m_records.push_back({id, time, T{std::forward<TArgs>(args)...}}); | |||||
profiler::Time time = sm_timer.record_host(); | |||||
profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward<TArgs>(args)...}}); | |||||
if constexpr (sm_debug) { | if constexpr (sm_debug) { | ||||
Status expected = Recording; | Status expected = Recording; | ||||
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); | mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); | ||||
@@ -127,7 +141,8 @@ public: | |||||
return id; | return id; | ||||
} | } | ||||
static results_t collect() { | |||||
static bundle_t collect() { | |||||
bundle_t bundle; | |||||
MGB_LOCK_GUARD(sm_mutex); | MGB_LOCK_GUARD(sm_mutex); | ||||
if constexpr (sm_debug) { | if constexpr (sm_debug) { | ||||
for (auto&& [tid, profiler]: sm_profilers) { | for (auto&& [tid, profiler]: sm_profilers) { | ||||
@@ -136,17 +151,17 @@ public: | |||||
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting)); | mgb_assert(profiler->m_status.compare_exchange_strong(expected, Collecting)); | ||||
} | } | ||||
} | } | ||||
std::vector<std::pair<std::thread::id, Record>> profile_data; | |||||
std::vector<entry_t> profile_data; | |||||
for (auto&& [tid, profiler]: sm_profilers) { | for (auto&& [tid, profiler]: sm_profilers) { | ||||
sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); | sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); | ||||
for (auto& record: profiler->m_records) { | for (auto& record: profiler->m_records) { | ||||
profile_data.push_back({tid, std::move(record)}); | |||||
profile_data.push_back(std::move(record)); | |||||
} | } | ||||
profiler->m_records.clear(); | profiler->m_records.clear(); | ||||
profiler->m_records.reserve(sm_preferred_capacity); | profiler->m_records.reserve(sm_preferred_capacity); | ||||
} | } | ||||
std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){ | std::sort(profile_data.begin(), profile_data.end(), [](auto& lhs, auto& rhs){ | ||||
return lhs.second.id < rhs.second.id; | |||||
return lhs.id < rhs.id; | |||||
}); | }); | ||||
if constexpr (sm_debug) { | if constexpr (sm_debug) { | ||||
for (auto&& [tid, profiler]: sm_profilers) { | for (auto&& [tid, profiler]: sm_profilers) { | ||||
@@ -155,7 +170,11 @@ public: | |||||
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); | mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); | ||||
} | } | ||||
} | } | ||||
return profile_data; | |||||
bundle.entries = profile_data; | |||||
bundle.options = get_options(); | |||||
bundle.start_at = sm_start_at; | |||||
bundle.thread_dict = get_thread_dict(); | |||||
return bundle; | |||||
} | } | ||||
static option_t get_option(std::string key, option_t default_val) { | static option_t get_option(std::string key, option_t default_val) { | ||||
@@ -179,6 +198,7 @@ public: | |||||
static void start_profile() { | static void start_profile() { | ||||
mgb_assert(!sm_profiling); | mgb_assert(!sm_profiling); | ||||
sm_start_at = Timer::record_host(); | |||||
sm_profiling = true; | sm_profiling = true; | ||||
} | } | ||||
@@ -189,7 +209,7 @@ public: | |||||
static thread_dict_t get_thread_dict(); | static thread_dict_t get_thread_dict(); | ||||
static void dump_profile(std::string basename, std::string format, results_t results, options_t options); | |||||
static void dump_profile(std::string basename, std::string format, bundle_t result); | |||||
}; | }; | ||||
@@ -19,7 +19,7 @@ target_include_directories(imperative_test PRIVATE ${MODULE_SRC_INCLUDE} ${PYTHO | |||||
target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C) | target_compile_definitions(imperative_test PRIVATE MODULE_NAME=C) | ||||
target_compile_options(imperative_test PRIVATE -Wno-unused-parameter) | target_compile_options(imperative_test PRIVATE -Wno-unused-parameter) | ||||
set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3) | |||||
set(LINK_LIBS megbrain megdnn ${MGE_CUDA_LIBS} gtest gmock pybind11::embed range-v3 nlohmann_json::nlohmann_json) | |||||
if(MGE_WITH_CUDA) | if(MGE_WITH_CUDA) | ||||
list(APPEND LINK_LIBS cudart) | list(APPEND LINK_LIBS cudart) | ||||