@@ -1012,9 +1012,10 @@ void init_tensor(py::module m) { | |||
interpreter_for_py->stop_profile(); | |||
interpreter_for_py->sync(); | |||
imperative::Profiler::stop_profile(); | |||
auto results = imperative::Profiler::collect(); | |||
return [results=std::move(results)](std::string basename, std::string format){ | |||
imperative::Profiler::dump_profile(basename, format, results); | |||
auto results = std::make_shared<imperative::Profiler::bundle_t>(imperative::Profiler::collect()); | |||
return [results=results](std::string basename, std::string format) mutable { | |||
imperative::Profiler::dump_profile(basename, format, std::move(*results)); | |||
results = nullptr; | |||
}; | |||
}, py::call_guard<py::gil_scoped_release>()); | |||
m.def("sync", | |||
@@ -1133,9 +1133,7 @@ void ChannelImpl::process_one_task(Command& icmd) { | |||
} | |||
} | |||
CompNode::foreach([&](CompNode device){ | |||
if (Profiler::get_option("sample_rate", 0)) { | |||
sample_on_device(device, true); | |||
} | |||
sample_on_device(device, true); | |||
MGB_RECORD_EVENT_IF((Profiler::get_option("profile_device", 0)), RecordDeviceEvent, Timer::record_device(device)); | |||
}); | |||
MGB_RECORD_EVENT(StartProfileFinishEvent); | |||
@@ -1149,9 +1147,7 @@ void ChannelImpl::process_one_task(Command& icmd) { | |||
MGB_RECORD_EVENT(TensorEraseEvent, info->id); | |||
} | |||
CompNode::foreach([&](CompNode device){ | |||
if (Profiler::get_option("sample_rate", 0)) { | |||
sample_on_device(device, true); | |||
} | |||
sample_on_device(device, true); | |||
}); | |||
MGB_RECORD_EVENT(StopProfileFinishEvent); | |||
} else if constexpr (std::is_same_v<T, PushScope>) { | |||
@@ -28,7 +28,7 @@ namespace mgb { | |||
namespace imperative { | |||
profiler::Time Timer::record_host() { | |||
return std::chrono::high_resolution_clock::now(); | |||
return std::chrono::system_clock::now(); | |||
} | |||
std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) { | |||
@@ -40,12 +40,12 @@ std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) { | |||
std::vector<Profiler::entry_t> Profiler::sm_records; | |||
Profiler::options_t Profiler::sm_profile_options; | |||
std::mutex Profiler::sm_mutex; | |||
std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers; | |||
std::unordered_map<std::thread::id, std::unique_ptr<Profiler>> Profiler::sm_profilers; | |||
Timer Profiler::sm_timer; | |||
profiler::HostTime Profiler::sm_start_at = profiler::HostTime::min(); | |||
std::atomic_uint64_t Profiler::sm_last_id = 0; | |||
bool Profiler::sm_profiling = false; | |||
thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>(); | |||
thread_local Profiler* Profiler::tm_profiler = nullptr; | |||
std::atomic_size_t Profiler::sm_preferred_capacity; | |||
auto Profiler::get_thread_dict() -> thread_dict_t { | |||
@@ -65,7 +65,7 @@ void Profiler::dump_profile(std::string basename, std::string format, bundle_t r | |||
if (iter == format_table.end()) { | |||
mgb_log_error("unsupported profiling format %s", format.c_str()); | |||
} | |||
return (iter->second)(basename, result); | |||
return (iter->second)(basename, std::move(result)); | |||
} | |||
} // namespace imperative | |||
@@ -268,7 +268,7 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> { | |||
.cat("Kernel") | |||
.args(current_op->detail()); | |||
} else if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) { | |||
if (current_tensor->living_time != profiler::Duration::zero()) { | |||
if (current_tensor->living_time == profiler::Duration::zero()) { | |||
new_host_event(pid_str, 's') | |||
.id(event.tensor_id) | |||
.cat("TensorLink") | |||
@@ -319,8 +319,8 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> { | |||
.scope(pid_str); | |||
} else if constexpr (std::is_same_v<TEvent, SampleDeviceFinishEvent>) { | |||
std::string device_name = event.device.locator().to_string(); | |||
new_host_event("memory", 'C') | |||
.arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory); | |||
new_host_event(ssprintf("%s_alloc_mem", device_name.c_str()), 'C') | |||
.arg("value", event.total_memory - event.free_memory); | |||
} else if constexpr (std::is_same_v<TEvent, TensorCommandEvent>) { | |||
new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B'); | |||
} else if constexpr (std::is_same_v<TEvent, TensorCommandFinishEvent>) { | |||
@@ -366,6 +366,12 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> { | |||
.arg("dtype", event.layout.dtype.name()) | |||
.arg("nr_elements", event.layout.total_nr_elems()) | |||
.arg("device", event.device.to_string()); | |||
} else if constexpr (std::is_same_v<TEvent, RecordDeviceEvent>) { | |||
auto current_host_time = current->time; | |||
auto current_device_time = to_device_time(current->time, event.event->comp_node()); | |||
auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>(current_device_time-current_host_time); | |||
new_host_event("device_ahead_ms", 'C') | |||
.arg("value", device_ahead.count()); | |||
} | |||
} | |||
@@ -261,7 +261,7 @@ struct MemoryFlowVisitor: EventVisitor<MemoryFlowVisitor> { | |||
void dump_memory_flow(std::string filename, Profiler::bundle_t result) { | |||
MemoryFlowVisitor visitor; | |||
visitor.process_events(std::move(result)); | |||
visitor.process_events(result); | |||
debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string()); | |||
} | |||
@@ -139,22 +139,22 @@ struct is_trace_event<T, decltype(std::declval<T>().trace, void())> : std::true_ | |||
template <typename... TItems> | |||
class AnyToVariantConverter { | |||
public: | |||
using any_t = std::any; | |||
using any_t = AnyPtr; | |||
using variant_t = std::variant<TItems...>; | |||
private: | |||
std::unordered_map<std::type_index, std::function<variant_t(any_t)>> m_table; | |||
std::unordered_map<std::type_index, std::function<variant_t(const any_t&)>> m_table; | |||
template <typename TItem> | |||
void register_converter() { | |||
m_table[typeid(TItem)] = [](any_t input) { | |||
return variant_t(std::any_cast<TItem>(std::move(input))); | |||
m_table[typeid(TItem)] = [](const any_t& input) { | |||
return variant_t(*input.as<TItem>()); | |||
}; | |||
} | |||
public: | |||
AnyToVariantConverter() { | |||
(register_converter<TItems>(), ...); | |||
} | |||
variant_t operator()(any_t input) { | |||
variant_t operator()(const any_t& input) { | |||
return m_table[input.type()](std::move(input)); | |||
} | |||
}; | |||
@@ -222,7 +222,7 @@ protected: | |||
value += delta; | |||
} | |||
public: | |||
void process_events(Profiler::bundle_t bundle) { | |||
void process_events(Profiler::bundle_t& bundle) { | |||
m_start_time = bundle.start_at; | |||
auto& self = static_cast<TSelf&>(*this); | |||
@@ -231,7 +231,7 @@ public: | |||
OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent, | |||
TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent, | |||
TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent, | |||
SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, | |||
SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent, | |||
StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent, | |||
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent, | |||
CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent, | |||
@@ -298,6 +298,7 @@ public: | |||
m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()}; | |||
} | |||
tensor.device = event.device; | |||
tensor.layout = event.layout; | |||
} | |||
}); | |||
@@ -34,9 +34,10 @@ namespace imperative { | |||
namespace profiler { | |||
using HostTime = std::chrono::time_point<std::chrono::high_resolution_clock>; | |||
using HostTime = std::chrono::time_point<std::chrono::system_clock>; | |||
using Duration = std::chrono::nanoseconds; | |||
using RealDuration = std::chrono::duration<double, std::nano>; | |||
using Time = HostTime; | |||
@@ -50,6 +51,52 @@ public: | |||
static std::shared_ptr<CompNode::Event> record_device(CompNode device); | |||
}; | |||
class AnyPtr { | |||
public: | |||
struct Deleter { | |||
void* object; | |||
void (*method)(void*, void*); | |||
void operator() (void* ptr) { | |||
method(object, ptr); | |||
} | |||
}; | |||
private: | |||
using holder_t = std::unique_ptr<void, Deleter>; | |||
const std::type_info* m_type = nullptr; | |||
holder_t m_holder = nullptr; | |||
public: | |||
AnyPtr() = default; | |||
template <typename T, typename=std::enable_if_t<!std::is_same_v<std::decay_t<T>, AnyPtr>>> | |||
explicit AnyPtr(T* value, Deleter deleter) { | |||
m_type = &typeid(T); | |||
m_holder = {value, deleter}; | |||
} | |||
template <typename T> | |||
T* as() { | |||
mgb_assert(is_exactly<T>(), "type mismatch"); | |||
return reinterpret_cast<T*>(m_holder.get()); | |||
} | |||
template <typename T> | |||
const T* as() const { | |||
mgb_assert(is_exactly<T>(), "type mismatch"); | |||
return reinterpret_cast<const T*>(m_holder.get()); | |||
} | |||
template <typename T> | |||
bool is_exactly() const { | |||
return std::type_index{typeid(T)} == std::type_index{*m_type}; | |||
} | |||
const std::type_info& type() const { | |||
return *m_type; | |||
} | |||
bool operator==(std::nullptr_t nptr) const { | |||
return m_holder == nullptr; | |||
} | |||
operator bool() const { | |||
return m_holder != nullptr; | |||
} | |||
}; | |||
class Profiler { | |||
public: | |||
@@ -57,7 +104,10 @@ public: | |||
uint64_t id; | |||
std::thread::id tid; | |||
profiler::Time time; | |||
std::any data; | |||
AnyPtr data; | |||
Record() = default; | |||
Record(uint64_t id, std::thread::id tid, profiler::Time time, AnyPtr data): | |||
id{id}, tid{tid}, time{time}, data{std::move(data)} {}; | |||
}; | |||
enum Status: uint8_t { | |||
Running = 0, | |||
@@ -82,36 +132,52 @@ private: | |||
std::thread::id m_thread_id; | |||
std::vector<Record> m_records; | |||
std::atomic<Status> m_status = Running; | |||
std::unordered_map<std::type_index, AnyPtr> m_mem_pools; | |||
static std::vector<entry_t> sm_records; | |||
static options_t sm_profile_options; | |||
static std::mutex sm_mutex; | |||
static std::unordered_map<std::thread::id, Profiler*> sm_profilers; | |||
// assume std::thread::id is unique | |||
static std::unordered_map<std::thread::id, std::unique_ptr<Profiler>> sm_profilers; | |||
static Timer sm_timer; | |||
static profiler::HostTime sm_start_at; | |||
static std::atomic_uint64_t sm_last_id; | |||
static std::atomic_size_t sm_preferred_capacity; | |||
static bool sm_profiling; | |||
static constexpr bool sm_debug = false; | |||
thread_local static std::unique_ptr<Profiler> tm_profiler; | |||
thread_local static Profiler* tm_profiler; | |||
public: | |||
Profiler() { | |||
m_thread_id = std::this_thread::get_id(); | |||
MGB_LOCK_GUARD(sm_mutex); | |||
mgb_assert(sm_profilers.count(m_thread_id) == 0); | |||
sm_profilers[m_thread_id] = this; | |||
} | |||
~Profiler() { | |||
MGB_LOCK_GUARD(sm_mutex); | |||
mgb_assert(sm_profilers.count(m_thread_id) == 1); | |||
sm_profilers.erase(m_thread_id); | |||
sm_records.insert(sm_records.end(), m_records.begin(), m_records.end()); | |||
// Constructs the per-thread profiler record keyed by `tid`.
// Must be constructed on that thread itself: records are attributed to
// m_thread_id, and the assert rejects cross-thread construction.
explicit Profiler(std::thread::id tid): m_thread_id{tid} {
    mgb_assert(tid == std::this_thread::get_id(), "thread id mismatch");
}
public: | |||
// Returns the calling thread's Profiler, creating it lazily on first use.
// Fast path is lock-free via the thread_local cache; the slow path takes
// the global lock once per thread to populate sm_profilers.
static Profiler& get_instance() {
    if (tm_profiler == nullptr) {
        MGB_LOCK_GUARD(sm_mutex);
        auto tid = std::this_thread::get_id();
        // operator[] default-inserts an empty slot on first access.
        auto& slot = sm_profilers[tid];
        if (slot == nullptr) {
            slot = std::make_unique<Profiler>(tid);
        }
        // Cache the raw pointer; the unique_ptr in sm_profilers owns it.
        tm_profiler = slot.get();
    }
    return *tm_profiler;
}
// Returns a memory pool for objects of type T, owned by the calling
// thread's Profiler; used to allocate profiling event payloads without
// per-event heap traffic.
template <typename T>
static MemPool<T>& get_mem_pool() {
    // Function-local thread_local caches the pool pointer so the common
    // path skips both get_instance() and the m_mem_pools map lookup.
    thread_local MemPool<T>* t_pool = nullptr;
    if (t_pool == nullptr) {
        // The pool lives in this thread's profiler behind a type-erased
        // AnyPtr, so it is destroyed together with the profiler.
        // NOTE(review): no lock here — presumably m_mem_pools is only ever
        // touched by its owning thread via this function; confirm no
        // cross-thread access exists.
        auto& pool = get_instance().m_mem_pools[typeid(MemPool<T>)];
        if (pool == nullptr) {
            pool = AnyPtr(new MemPool<T>(), {nullptr, [](void*, void* ptr){
                delete reinterpret_cast<MemPool<T>*>(ptr);
            }}); // deleter ignores `object`; plain heap delete
        }
        t_pool = pool.as<MemPool<T>>();
    }
    return *t_pool;
}
static uint64_t next_id() { | |||
return sm_last_id++; | |||
} | |||
@@ -119,13 +185,19 @@ public: | |||
template <typename T, typename... TArgs> | |||
static uint64_t record(TArgs&&... args) { | |||
auto& profiler = get_instance(); | |||
auto& mem_pool = get_mem_pool<T>(); | |||
if constexpr (sm_debug) { | |||
Status expected = Running; | |||
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); | |||
} | |||
uint64_t id = next_id(); | |||
profiler::Time time = sm_timer.record_host(); | |||
profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward<TArgs>(args)...}}); | |||
auto deleter = [](void* obj, void* ptr){ | |||
reinterpret_cast<MemPool<T>*>(obj)->free(reinterpret_cast<T*>(ptr)); | |||
}; | |||
profiler.m_records.emplace_back(id, profiler.m_thread_id, time, AnyPtr{ | |||
mem_pool.alloc(T{std::forward<TArgs>(args)...}), {&mem_pool, deleter} | |||
}); | |||
if constexpr (sm_debug) { | |||
Status expected = Recording; | |||
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); | |||
@@ -146,7 +218,9 @@ public: | |||
std::vector<entry_t> profile_data = std::move(sm_records); | |||
for (auto&& [tid, profiler]: sm_profilers) { | |||
sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); | |||
profile_data.insert(profile_data.end(), profiler->m_records.begin(), profiler->m_records.end()); | |||
profile_data.insert(profile_data.end(), | |||
std::make_move_iterator(profiler->m_records.begin()), | |||
std::make_move_iterator(profiler->m_records.end())); | |||
profiler->m_records.clear(); | |||
profiler->m_records.reserve(sm_preferred_capacity); | |||
} | |||
@@ -160,11 +234,11 @@ public: | |||
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); | |||
} | |||
} | |||
bundle.entries = profile_data; | |||
bundle.entries = std::move(profile_data); | |||
bundle.options = get_options(); | |||
bundle.start_at = sm_start_at; | |||
bundle.thread_dict = get_thread_dict(); | |||
return bundle; | |||
return std::move(bundle); | |||
} | |||
static option_t get_option(std::string key, option_t default_val) { | |||
@@ -203,31 +277,6 @@ public: | |||
}; | |||
class ProfileDataCollector { | |||
public: | |||
template <typename T> | |||
using SubCollector = std::function<void(uint64_t, std::thread::id, uint64_t, T)>; | |||
private: | |||
std::unordered_map<std::type_index, SubCollector<std::any>> m_collectors; | |||
public: | |||
template <typename T> | |||
ProfileDataCollector& handle(SubCollector<T> collector) { | |||
auto erased = [collector](uint64_t id, std::thread::id tid, uint64_t time, std::any data){ | |||
collector(id, tid, time, std::any_cast<T>(std::move(data))); | |||
}; | |||
m_collectors[typeid(T)] = erased; | |||
return *this; | |||
} | |||
void operator()(uint64_t id, std::thread::id tid, uint64_t time, std::any event) { | |||
std::type_index type = event.type(); | |||
if (m_collectors.count(type) == 0) { | |||
return; | |||
} | |||
auto& handler = m_collectors.at(type); | |||
handler(id, tid, time, std::move(event)); | |||
} | |||
}; | |||
#define MGB_RECORD_EVENT(type, ...) \ | |||
if (mgb::imperative::Profiler::is_profiling()) { \ | |||
mgb::imperative::Profiler::record<type>(type{__VA_ARGS__}); \ | |||
@@ -25,11 +25,11 @@ TEST(TestProfiler, ImperativeLogProfile) { | |||
imperative_log_profile("XXX"); | |||
auto results = imperative::Profiler::collect(); | |||
imperative::Profiler::stop_profile(); | |||
mgb_assert(results.size() == 2); | |||
auto* event_start = std::any_cast<profiler::CustomEvent>(&results[0].second.data); | |||
auto* event_finish = std::any_cast<profiler::CustomFinishEvent>(&results[1].second.data); | |||
mgb_assert(results.entries.size() == 2); | |||
auto* event_start = results.entries[0].data.as<profiler::CustomEvent>(); | |||
auto* event_finish = results.entries[1].data.as<profiler::CustomFinishEvent>(); | |||
mgb_assert(event_start && event_start->title == "XXX"); | |||
mgb_assert(event_finish && event_finish->title == "XXX"); | |||
mgb_assert(results[0].second.time < results[1].second.time); | |||
mgb_assert(results[0].second.id < results[1].second.id); | |||
mgb_assert(results.entries[0].time < results.entries[1].time); | |||
mgb_assert(results.entries[0].id < results.entries[1].id); | |||
} |