Browse Source

perf(profiler): reduce profiler host overhead

GitOrigin-RevId: 92cea560f5
release-1.6
Megvii Engine Team 3 years ago
parent
commit
d4c71f92dc
8 changed files with 126 additions and 73 deletions
  1. +4
    -3
      imperative/python/src/tensor.cpp
  2. +2
    -6
      imperative/src/impl/interpreter/interpreter_impl.cpp
  3. +4
    -4
      imperative/src/impl/profiler.cpp
  4. +9
    -3
      imperative/src/impl/profiler/chrome_timeline.cpp
  5. +1
    -1
      imperative/src/impl/profiler/memory_chunk.cpp
  6. +8
    -7
      imperative/src/impl/profiler/states.h
  7. +93
    -44
      imperative/src/include/megbrain/imperative/profiler.h
  8. +5
    -5
      imperative/src/test/profiler.cpp

+ 4
- 3
imperative/python/src/tensor.cpp View File

@@ -1012,9 +1012,10 @@ void init_tensor(py::module m) {
interpreter_for_py->stop_profile(); interpreter_for_py->stop_profile();
interpreter_for_py->sync(); interpreter_for_py->sync();
imperative::Profiler::stop_profile(); imperative::Profiler::stop_profile();
auto results = imperative::Profiler::collect();
return [results=std::move(results)](std::string basename, std::string format){
imperative::Profiler::dump_profile(basename, format, results);
auto results = std::make_shared<imperative::Profiler::bundle_t>(imperative::Profiler::collect());
return [results=results](std::string basename, std::string format) mutable {
imperative::Profiler::dump_profile(basename, format, std::move(*results));
results = nullptr;
}; };
}, py::call_guard<py::gil_scoped_release>()); }, py::call_guard<py::gil_scoped_release>());
m.def("sync", m.def("sync",


+ 2
- 6
imperative/src/impl/interpreter/interpreter_impl.cpp View File

@@ -1133,9 +1133,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
} }
} }
CompNode::foreach([&](CompNode device){ CompNode::foreach([&](CompNode device){
if (Profiler::get_option("sample_rate", 0)) {
sample_on_device(device, true);
}
sample_on_device(device, true);
MGB_RECORD_EVENT_IF((Profiler::get_option("profile_device", 0)), RecordDeviceEvent, Timer::record_device(device)); MGB_RECORD_EVENT_IF((Profiler::get_option("profile_device", 0)), RecordDeviceEvent, Timer::record_device(device));
}); });
MGB_RECORD_EVENT(StartProfileFinishEvent); MGB_RECORD_EVENT(StartProfileFinishEvent);
@@ -1149,9 +1147,7 @@ void ChannelImpl::process_one_task(Command& icmd) {
MGB_RECORD_EVENT(TensorEraseEvent, info->id); MGB_RECORD_EVENT(TensorEraseEvent, info->id);
} }
CompNode::foreach([&](CompNode device){ CompNode::foreach([&](CompNode device){
if (Profiler::get_option("sample_rate", 0)) {
sample_on_device(device, true);
}
sample_on_device(device, true);
}); });
MGB_RECORD_EVENT(StopProfileFinishEvent); MGB_RECORD_EVENT(StopProfileFinishEvent);
} else if constexpr (std::is_same_v<T, PushScope>) { } else if constexpr (std::is_same_v<T, PushScope>) {


+ 4
- 4
imperative/src/impl/profiler.cpp View File

@@ -28,7 +28,7 @@ namespace mgb {
namespace imperative { namespace imperative {


profiler::Time Timer::record_host() { profiler::Time Timer::record_host() {
return std::chrono::high_resolution_clock::now();
return std::chrono::system_clock::now();
} }


std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) { std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) {
@@ -40,12 +40,12 @@ std::shared_ptr<CompNode::Event> Timer::record_device(CompNode device) {
std::vector<Profiler::entry_t> Profiler::sm_records; std::vector<Profiler::entry_t> Profiler::sm_records;
Profiler::options_t Profiler::sm_profile_options; Profiler::options_t Profiler::sm_profile_options;
std::mutex Profiler::sm_mutex; std::mutex Profiler::sm_mutex;
std::unordered_map<std::thread::id, Profiler*> Profiler::sm_profilers;
std::unordered_map<std::thread::id, std::unique_ptr<Profiler>> Profiler::sm_profilers;
Timer Profiler::sm_timer; Timer Profiler::sm_timer;
profiler::HostTime Profiler::sm_start_at = profiler::HostTime::min(); profiler::HostTime Profiler::sm_start_at = profiler::HostTime::min();
std::atomic_uint64_t Profiler::sm_last_id = 0; std::atomic_uint64_t Profiler::sm_last_id = 0;
bool Profiler::sm_profiling = false; bool Profiler::sm_profiling = false;
thread_local std::unique_ptr<Profiler> Profiler::tm_profiler = std::make_unique<Profiler>();
thread_local Profiler* Profiler::tm_profiler = nullptr;
std::atomic_size_t Profiler::sm_preferred_capacity; std::atomic_size_t Profiler::sm_preferred_capacity;


auto Profiler::get_thread_dict() -> thread_dict_t { auto Profiler::get_thread_dict() -> thread_dict_t {
@@ -65,7 +65,7 @@ void Profiler::dump_profile(std::string basename, std::string format, bundle_t r
if (iter == format_table.end()) { if (iter == format_table.end()) {
mgb_log_error("unsupported profiling format %s", format.c_str()); mgb_log_error("unsupported profiling format %s", format.c_str());
} }
return (iter->second)(basename, result);
return (iter->second)(basename, std::move(result));
} }


} // namespace imperative } // namespace imperative


+ 9
- 3
imperative/src/impl/profiler/chrome_timeline.cpp View File

@@ -268,7 +268,7 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
.cat("Kernel") .cat("Kernel")
.args(current_op->detail()); .args(current_op->detail());
} else if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) { } else if constexpr (std::is_same_v<TEvent, TensorProduceEvent>) {
if (current_tensor->living_time != profiler::Duration::zero()) {
if (current_tensor->living_time == profiler::Duration::zero()) {
new_host_event(pid_str, 's') new_host_event(pid_str, 's')
.id(event.tensor_id) .id(event.tensor_id)
.cat("TensorLink") .cat("TensorLink")
@@ -319,8 +319,8 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
.scope(pid_str); .scope(pid_str);
} else if constexpr (std::is_same_v<TEvent, SampleDeviceFinishEvent>) { } else if constexpr (std::is_same_v<TEvent, SampleDeviceFinishEvent>) {
std::string device_name = event.device.locator().to_string(); std::string device_name = event.device.locator().to_string();
new_host_event("memory", 'C')
.arg(ssprintf("%s_alloc_mem", device_name.c_str()), event.total_memory - event.free_memory);
new_host_event(ssprintf("%s_alloc_mem", device_name.c_str()), 'C')
.arg("value", event.total_memory - event.free_memory);
} else if constexpr (std::is_same_v<TEvent, TensorCommandEvent>) { } else if constexpr (std::is_same_v<TEvent, TensorCommandEvent>) {
new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B'); new_host_event(ssprintf("%s %zu", to_cstr(event.kind), event.tensor_id), 'B');
} else if constexpr (std::is_same_v<TEvent, TensorCommandFinishEvent>) { } else if constexpr (std::is_same_v<TEvent, TensorCommandFinishEvent>) {
@@ -366,6 +366,12 @@ struct ChromeTimelineEventVisitor: EventVisitor<ChromeTimelineEventVisitor> {
.arg("dtype", event.layout.dtype.name()) .arg("dtype", event.layout.dtype.name())
.arg("nr_elements", event.layout.total_nr_elems()) .arg("nr_elements", event.layout.total_nr_elems())
.arg("device", event.device.to_string()); .arg("device", event.device.to_string());
} else if constexpr (std::is_same_v<TEvent, RecordDeviceEvent>) {
auto current_host_time = current->time;
auto current_device_time = to_device_time(current->time, event.event->comp_node());
auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>(current_device_time-current_host_time);
new_host_event("device_ahead_ms", 'C')
.arg("value", device_ahead.count());
} }
} }




+ 1
- 1
imperative/src/impl/profiler/memory_chunk.cpp View File

@@ -261,7 +261,7 @@ struct MemoryFlowVisitor: EventVisitor<MemoryFlowVisitor> {


void dump_memory_flow(std::string filename, Profiler::bundle_t result) { void dump_memory_flow(std::string filename, Profiler::bundle_t result) {
MemoryFlowVisitor visitor; MemoryFlowVisitor visitor;
visitor.process_events(std::move(result));
visitor.process_events(result);
debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string()); debug::write_to_file(filename.c_str(), visitor.memory_flow.to_svg().to_string());
} }




+ 8
- 7
imperative/src/impl/profiler/states.h View File

@@ -139,22 +139,22 @@ struct is_trace_event<T, decltype(std::declval<T>().trace, void())> : std::true_
template <typename... TItems> template <typename... TItems>
class AnyToVariantConverter { class AnyToVariantConverter {
public: public:
using any_t = std::any;
using any_t = AnyPtr;
using variant_t = std::variant<TItems...>; using variant_t = std::variant<TItems...>;
private: private:
std::unordered_map<std::type_index, std::function<variant_t(any_t)>> m_table;
std::unordered_map<std::type_index, std::function<variant_t(const any_t&)>> m_table;


template <typename TItem> template <typename TItem>
void register_converter() { void register_converter() {
m_table[typeid(TItem)] = [](any_t input) {
return variant_t(std::any_cast<TItem>(std::move(input)));
m_table[typeid(TItem)] = [](const any_t& input) {
return variant_t(*input.as<TItem>());
}; };
} }
public: public:
AnyToVariantConverter() { AnyToVariantConverter() {
(register_converter<TItems>(), ...); (register_converter<TItems>(), ...);
} }
variant_t operator()(any_t input) {
variant_t operator()(const any_t& input) {
return m_table[input.type()](std::move(input)); return m_table[input.type()](std::move(input));
} }
}; };
@@ -222,7 +222,7 @@ protected:
value += delta; value += delta;
} }
public: public:
void process_events(Profiler::bundle_t bundle) {
void process_events(Profiler::bundle_t& bundle) {
m_start_time = bundle.start_at; m_start_time = bundle.start_at;


auto& self = static_cast<TSelf&>(*this); auto& self = static_cast<TSelf&>(*this);
@@ -231,7 +231,7 @@ public:
OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent, OpInputEvent, OpInputFinishEvent, OpOutputEvent, OpOutputFinishEvent,
TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent, TensorDeclareEvent, TensorProduceEvent, TensorUsageEvent, TensorReleaseEvent, TensorEraseEvent,
TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent, TensorGetPropEvent, TensorNotifyPropEvent, TensorWaitPropEvent, TensorWaitPropFinishEvent,
SampleDeviceEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent,
SampleDeviceEvent, SampleDeviceFinishEvent, WorkerExceptionEvent, ShapeInferEvent, SyncEvent, SyncFinishEvent,
StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent, StartProfileEvent, StartProfileFinishEvent, StopProfileEvent, StopProfileFinishEvent,
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent, TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, AutoEvictFinishEvent,
CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, ScopeEvent, ScopeFinishEvent,
@@ -298,6 +298,7 @@ public:
m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()}; m_device_tid_table[event.device] = {m_device_tid_table.size() + m_host_tid_table.size()};
} }
tensor.device = event.device; tensor.device = event.device;
tensor.layout = event.layout;
} }
}); });




+ 93
- 44
imperative/src/include/megbrain/imperative/profiler.h View File

@@ -34,9 +34,10 @@ namespace imperative {


namespace profiler { namespace profiler {


using HostTime = std::chrono::time_point<std::chrono::high_resolution_clock>;
using HostTime = std::chrono::time_point<std::chrono::system_clock>;


using Duration = std::chrono::nanoseconds; using Duration = std::chrono::nanoseconds;

using RealDuration = std::chrono::duration<double, std::nano>; using RealDuration = std::chrono::duration<double, std::nano>;


using Time = HostTime; using Time = HostTime;
@@ -50,6 +51,52 @@ public:
static std::shared_ptr<CompNode::Event> record_device(CompNode device); static std::shared_ptr<CompNode::Event> record_device(CompNode device);
}; };


class AnyPtr {
public:
struct Deleter {
void* object;
void (*method)(void*, void*);
void operator() (void* ptr) {
method(object, ptr);
}
};
private:
using holder_t = std::unique_ptr<void, Deleter>;

const std::type_info* m_type = nullptr;
holder_t m_holder = nullptr;
public:
AnyPtr() = default;
template <typename T, typename=std::enable_if_t<!std::is_same_v<std::decay_t<T>, AnyPtr>>>
explicit AnyPtr(T* value, Deleter deleter) {
m_type = &typeid(T);
m_holder = {value, deleter};
}
template <typename T>
T* as() {
mgb_assert(is_exactly<T>(), "type mismatch");
return reinterpret_cast<T*>(m_holder.get());
}
template <typename T>
const T* as() const {
mgb_assert(is_exactly<T>(), "type mismatch");
return reinterpret_cast<const T*>(m_holder.get());
}
template <typename T>
bool is_exactly() const {
return std::type_index{typeid(T)} == std::type_index{*m_type};
}
const std::type_info& type() const {
return *m_type;
}
bool operator==(std::nullptr_t nptr) const {
return m_holder == nullptr;
}
operator bool() const {
return m_holder != nullptr;
}
};



class Profiler { class Profiler {
public: public:
@@ -57,7 +104,10 @@ public:
uint64_t id; uint64_t id;
std::thread::id tid; std::thread::id tid;
profiler::Time time; profiler::Time time;
std::any data;
AnyPtr data;
Record() = default;
Record(uint64_t id, std::thread::id tid, profiler::Time time, AnyPtr data):
id{id}, tid{tid}, time{time}, data{std::move(data)} {};
}; };
enum Status: uint8_t { enum Status: uint8_t {
Running = 0, Running = 0,
@@ -82,36 +132,52 @@ private:
std::thread::id m_thread_id; std::thread::id m_thread_id;
std::vector<Record> m_records; std::vector<Record> m_records;
std::atomic<Status> m_status = Running; std::atomic<Status> m_status = Running;
std::unordered_map<std::type_index, AnyPtr> m_mem_pools;


static std::vector<entry_t> sm_records; static std::vector<entry_t> sm_records;
static options_t sm_profile_options; static options_t sm_profile_options;
static std::mutex sm_mutex; static std::mutex sm_mutex;
static std::unordered_map<std::thread::id, Profiler*> sm_profilers;
// assume std::thread::id is unique
static std::unordered_map<std::thread::id, std::unique_ptr<Profiler>> sm_profilers;
static Timer sm_timer; static Timer sm_timer;
static profiler::HostTime sm_start_at; static profiler::HostTime sm_start_at;
static std::atomic_uint64_t sm_last_id; static std::atomic_uint64_t sm_last_id;
static std::atomic_size_t sm_preferred_capacity; static std::atomic_size_t sm_preferred_capacity;
static bool sm_profiling; static bool sm_profiling;
static constexpr bool sm_debug = false; static constexpr bool sm_debug = false;
thread_local static std::unique_ptr<Profiler> tm_profiler;
thread_local static Profiler* tm_profiler;
public: public:
Profiler() {
m_thread_id = std::this_thread::get_id();
MGB_LOCK_GUARD(sm_mutex);
mgb_assert(sm_profilers.count(m_thread_id) == 0);
sm_profilers[m_thread_id] = this;
}
~Profiler() {
MGB_LOCK_GUARD(sm_mutex);
mgb_assert(sm_profilers.count(m_thread_id) == 1);
sm_profilers.erase(m_thread_id);
sm_records.insert(sm_records.end(), m_records.begin(), m_records.end());
explicit Profiler(std::thread::id tid): m_thread_id{tid} {
mgb_assert(tid == std::this_thread::get_id(), "thread id mismatch");
} }
public: public:
static Profiler& get_instance() { static Profiler& get_instance() {
if (!tm_profiler) {
MGB_LOCK_GUARD(sm_mutex);
auto& profiler = sm_profilers[std::this_thread::get_id()];
if (!profiler) {
profiler = std::make_unique<Profiler>(std::this_thread::get_id());
}
tm_profiler = profiler.get();
}
return *tm_profiler; return *tm_profiler;
} }


template <typename T>
static MemPool<T>& get_mem_pool() {
thread_local MemPool<T>* t_pool = nullptr;
if (t_pool == nullptr) {
auto& pool = get_instance().m_mem_pools[typeid(MemPool<T>)];
if (pool == nullptr) {
pool = AnyPtr(new MemPool<T>(), {nullptr, [](void*, void* ptr){
delete reinterpret_cast<MemPool<T>*>(ptr);
}});
}
t_pool = pool.as<MemPool<T>>();
}
return *t_pool;
}

static uint64_t next_id() { static uint64_t next_id() {
return sm_last_id++; return sm_last_id++;
} }
@@ -119,13 +185,19 @@ public:
template <typename T, typename... TArgs> template <typename T, typename... TArgs>
static uint64_t record(TArgs&&... args) { static uint64_t record(TArgs&&... args) {
auto& profiler = get_instance(); auto& profiler = get_instance();
auto& mem_pool = get_mem_pool<T>();
if constexpr (sm_debug) { if constexpr (sm_debug) {
Status expected = Running; Status expected = Running;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording)); mgb_assert(profiler.m_status.compare_exchange_strong(expected, Recording));
} }
uint64_t id = next_id(); uint64_t id = next_id();
profiler::Time time = sm_timer.record_host(); profiler::Time time = sm_timer.record_host();
profiler.m_records.push_back({id, std::this_thread::get_id(), time, T{std::forward<TArgs>(args)...}});
auto deleter = [](void* obj, void* ptr){
reinterpret_cast<MemPool<T>*>(obj)->free(reinterpret_cast<T*>(ptr));
};
profiler.m_records.emplace_back(id, profiler.m_thread_id, time, AnyPtr{
mem_pool.alloc(T{std::forward<TArgs>(args)...}), {&mem_pool, deleter}
});
if constexpr (sm_debug) { if constexpr (sm_debug) {
Status expected = Recording; Status expected = Recording;
mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running)); mgb_assert(profiler.m_status.compare_exchange_strong(expected, Running));
@@ -146,7 +218,9 @@ public:
std::vector<entry_t> profile_data = std::move(sm_records); std::vector<entry_t> profile_data = std::move(sm_records);
for (auto&& [tid, profiler]: sm_profilers) { for (auto&& [tid, profiler]: sm_profilers) {
sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size()); sm_preferred_capacity = std::max(sm_preferred_capacity.load(), profiler->m_records.size());
profile_data.insert(profile_data.end(), profiler->m_records.begin(), profiler->m_records.end());
profile_data.insert(profile_data.end(),
std::make_move_iterator(profiler->m_records.begin()),
std::make_move_iterator(profiler->m_records.end()));
profiler->m_records.clear(); profiler->m_records.clear();
profiler->m_records.reserve(sm_preferred_capacity); profiler->m_records.reserve(sm_preferred_capacity);
} }
@@ -160,11 +234,11 @@ public:
mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running)); mgb_assert(profiler->m_status.compare_exchange_strong(expected, Running));
} }
} }
bundle.entries = profile_data;
bundle.entries = std::move(profile_data);
bundle.options = get_options(); bundle.options = get_options();
bundle.start_at = sm_start_at; bundle.start_at = sm_start_at;
bundle.thread_dict = get_thread_dict(); bundle.thread_dict = get_thread_dict();
return bundle;
return std::move(bundle);
} }


static option_t get_option(std::string key, option_t default_val) { static option_t get_option(std::string key, option_t default_val) {
@@ -203,31 +277,6 @@ public:
}; };




class ProfileDataCollector {
public:
template <typename T>
using SubCollector = std::function<void(uint64_t, std::thread::id, uint64_t, T)>;
private:
std::unordered_map<std::type_index, SubCollector<std::any>> m_collectors;
public:
template <typename T>
ProfileDataCollector& handle(SubCollector<T> collector) {
auto erased = [collector](uint64_t id, std::thread::id tid, uint64_t time, std::any data){
collector(id, tid, time, std::any_cast<T>(std::move(data)));
};
m_collectors[typeid(T)] = erased;
return *this;
}
void operator()(uint64_t id, std::thread::id tid, uint64_t time, std::any event) {
std::type_index type = event.type();
if (m_collectors.count(type) == 0) {
return;
}
auto& handler = m_collectors.at(type);
handler(id, tid, time, std::move(event));
}
};

#define MGB_RECORD_EVENT(type, ...) \ #define MGB_RECORD_EVENT(type, ...) \
if (mgb::imperative::Profiler::is_profiling()) { \ if (mgb::imperative::Profiler::is_profiling()) { \
mgb::imperative::Profiler::record<type>(type{__VA_ARGS__}); \ mgb::imperative::Profiler::record<type>(type{__VA_ARGS__}); \


+ 5
- 5
imperative/src/test/profiler.cpp View File

@@ -25,11 +25,11 @@ TEST(TestProfiler, ImperativeLogProfile) {
imperative_log_profile("XXX"); imperative_log_profile("XXX");
auto results = imperative::Profiler::collect(); auto results = imperative::Profiler::collect();
imperative::Profiler::stop_profile(); imperative::Profiler::stop_profile();
mgb_assert(results.size() == 2);
auto* event_start = std::any_cast<profiler::CustomEvent>(&results[0].second.data);
auto* event_finish = std::any_cast<profiler::CustomFinishEvent>(&results[1].second.data);
mgb_assert(results.entries.size() == 2);
auto* event_start = results.entries[0].data.as<profiler::CustomEvent>();
auto* event_finish = results.entries[1].data.as<profiler::CustomFinishEvent>();
mgb_assert(event_start && event_start->title == "XXX"); mgb_assert(event_start && event_start->title == "XXX");
mgb_assert(event_finish && event_finish->title == "XXX"); mgb_assert(event_finish && event_finish->title == "XXX");
mgb_assert(results[0].second.time < results[1].second.time);
mgb_assert(results[0].second.id < results[1].second.id);
mgb_assert(results.entries[0].time < results.entries[1].time);
mgb_assert(results.entries[0].id < results.entries[1].id);
} }

Loading…
Cancel
Save