@@ -29,6 +29,7 @@ endif() | |||
include(GNUInstallDirs) | |||
include(CheckCXXCompilerFlag) | |||
include(CheckIPOSupported) | |||
include(CMakeDependentOption) | |||
check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) | |||
@@ -97,6 +98,12 @@ option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF) | |||
option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF) | |||
option(MGE_SYNC_THIRD_PARTY "help sync third_party submodule" OFF) | |||
# TODO: add windows support | |||
cmake_dependent_option(MGE_WITH_CUPTI "Build with CUPTI" ON | |||
"MGE_WITH_CUDA;MGE_BUILD_IMPERATIVE_RT;NOT MSVC;NOT WIN32" OFF) | |||
set(MGB_CUPTI ${MGE_WITH_CUPTI}) | |||
if(MSVC OR WIN32) | |||
# FIXME: static link Windows vc runtime with some version from Visual Studio have some | |||
# runtime issue at some call PATH, for example: _imperative_rt.pyd --> | |||
@@ -686,6 +693,10 @@ if(MGB_WITH_FLATBUFFERS) | |||
include(cmake/flatbuffers.cmake) | |||
endif() | |||
if(MGE_WITH_CUPTI) | |||
include(cmake/cupti.cmake) | |||
endif() | |||
if(MGE_WITH_CUDA) | |||
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) | |||
foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) | |||
@@ -6,7 +6,7 @@ endif() | |||
if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") | |||
set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | |||
endif() | |||
message("CUDNN ROOT: " ${CUDNN_ROOT_DIR}) | |||
message(STATUS "CUDNN ROOT: ${CUDNN_ROOT_DIR}") | |||
if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) | |||
find_library( | |||
CUDNN_LIBRARY | |||
@@ -0,0 +1,85 @@ | |||
# Resolve CUDA_ROOT_DIR. A value that is already set wins; otherwise the first
# non-empty environment variable among CUDA_ROOT_DIR, CUDA_PATH and CUDA_BIN_PATH
# (checked in that order) is used.
foreach(cuda_root_env_name CUDA_ROOT_DIR CUDA_PATH CUDA_BIN_PATH)
  if("${CUDA_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{${cuda_root_env_name}}" STREQUAL "")
    set(CUDA_ROOT_DIR $ENV{${cuda_root_env_name}})
  endif()
endforeach()

# Without a CUDA root there is nowhere to look for CuPTI, so stop configuring.
if("${CUDA_ROOT_DIR}" STREQUAL "")
  message(
    FATAL_ERROR
      "Can not find CUDA, please export cuda sdk path to CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH"
  )
endif()
# TODO: find_library(CUDA_ROOT_DIR) in cmake/cuda.cmake

# CuPTI follows the CUDA linkage mode unless one of the checks below forces the
# shared library. The value is quoted so the variable is always defined.
set(MGE_CUPTI_USE_STATIC "${MGE_CUDA_USE_STATIC}")

# relates https://stackoverflow.com/questions/67485114
#
# The variables are passed to if() by name instead of being expanded with ${}:
# check_cxx_compiler_flag() stores an empty string when a flag is unsupported, and
# an expanded empty operand ("if(ON AND )") is a configure-time syntax error.
if(MGE_CUDA_USE_STATIC AND CXX_SUPPORT_GOLD)
  message(
    WARNING
      "static linking CuPTI with gold may break exception handling, use shared one instead"
  )
  set(MGE_CUPTI_USE_STATIC OFF)
endif()

if(MGE_CUPTI_USE_STATIC)
  find_library(
    CUPTI_LIBRARY
    NAMES libcupti_static.a
    HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
    PATH_SUFFIXES lib lib64
    DOC "CuPTI library.")
  if(NOT CUPTI_LIBRARY)
    # A *-NOTFOUND cache entry is searched again by the next find_library() call,
    # so falling back to the shared library works with the same variable.
    message(WARNING "Can not find static CuPTI Library, use shared one instead")
    set(MGE_CUPTI_USE_STATIC OFF)
  endif()
endif()

if(MGE_CUPTI_USE_STATIC)
  set(CUPTI_LIBRARY_TYPE STATIC)
else()
  find_library(
    CUPTI_LIBRARY
    NAMES libcupti.so
    HINTS "${CUDA_ROOT_DIR}" "${CUDA_ROOT_DIR}/extras/CUPTI"
    PATH_SUFFIXES lib lib64
    DOC "CuPTI library.")
  set(CUPTI_LIBRARY_TYPE SHARED)
endif()

if(NOT CUPTI_LIBRARY)
  message(FATAL_ERROR "Can not find CuPTI Library")
endif()
# Locate the directory that contains cupti.h.
find_path(
  CUPTI_INCLUDE_DIR
  NAMES cupti.h
  HINTS ${CUDA_ROOT_DIR} ${CUDA_ROOT_DIR}/extras/CUPTI
  PATH_SUFFIXES include
  DOC "Path to CuPTI include directory.")

if("${CUPTI_INCLUDE_DIR}" STREQUAL "CUPTI_INCLUDE_DIR-NOTFOUND")
  message(FATAL_ERROR "Can not find CuPTI INCLUDE")
endif()

# Read CUPTI_API_VERSION from cupti_version.h when that header exists, otherwise
# from cupti.h itself.
if(NOT EXISTS "${CUPTI_INCLUDE_DIR}/cupti_version.h")
  file(READ "${CUPTI_INCLUDE_DIR}/cupti.h" CUPTI_VERSION_FILE_CONTENTS)
else()
  file(READ "${CUPTI_INCLUDE_DIR}/cupti_version.h" CUPTI_VERSION_FILE_CONTENTS)
endif()

# Step 1 isolates the "#define" line, step 2 reduces it to the bare number.
# CUPTI_API_VERSION ends up empty when the define is not present.
string(REGEX MATCH "define CUPTI_API_VERSION * +([0-9]+)" CUPTI_API_VERSION
             "${CUPTI_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define CUPTI_API_VERSION * +([0-9]+)" "\\1" CUPTI_API_VERSION
                     "${CUPTI_API_VERSION}")
# Expose the discovered CuPTI library as the imported target `libcupti`; linking
# against it also adds the CuPTI include directory for the consumer.
add_library(libcupti ${CUPTI_LIBRARY_TYPE} IMPORTED)
set_property(TARGET libcupti PROPERTY IMPORTED_LOCATION "${CUPTI_LIBRARY}")
set_property(TARGET libcupti PROPERTY INTERFACE_INCLUDE_DIRECTORIES
                                      "${CUPTI_INCLUDE_DIR}")

message(STATUS "Found CuPTI: ${CUPTI_LIBRARY} (found version: ${CUPTI_API_VERSION})")
@@ -36,7 +36,7 @@ else() | |||
PATH_SUFFIXES lib lib64 | |||
DOC "TRT plugin library.") | |||
endif() | |||
message("TRT_LIBRARY" ${TRT_LIBRARY}) | |||
message(STATUS "TRT_LIBRARY: ${TRT_LIBRARY}") | |||
if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") | |||
message( | |||
FATAL_ERROR | |||
@@ -51,6 +51,10 @@ if(ANDROID) | |||
target_link_libraries(${MODULE_NAME} PRIVATE ${PYTHON_LIBRARIES}) | |||
endif() | |||
if(MGE_WITH_CUPTI) | |||
target_link_libraries(${MODULE_NAME} PRIVATE libcupti) | |||
endif() | |||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 | |||
${PROJECT_BINARY_DIR}/third_party/range-v3) | |||
target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | |||
@@ -16,6 +16,10 @@ from weakref import WeakSet | |||
from .. import _atexit | |||
from ..core._imperative_rt.core2 import ( | |||
cupti_available, | |||
disable_cupti, | |||
enable_cupti, | |||
full_sync, | |||
pop_scope, | |||
push_scope, | |||
start_profile, | |||
@@ -50,13 +54,18 @@ class Profiler(ContextDecorator): | |||
with profiler: | |||
# your code here | |||
# Then open the profile file in chrome timeline window | |||
""" | |||
CHROME_TIMELINE = "chrome_timeline.json" | |||
valid_options = {"sample_rate": 0, "profile_device": 1, "num_tensor_watch": 10} | |||
valid_options = { | |||
"sample_rate": 0, | |||
"profile_device": 1, | |||
"num_tensor_watch": 10, | |||
"enable_cupti": 0, | |||
} | |||
valid_formats = {"chrome_timeline.json", "memory_flow.svg"} | |||
def __init__( | |||
@@ -83,6 +92,11 @@ class Profiler(ContextDecorator): | |||
self._options[opt] = int(kwargs.pop(opt, optval)) | |||
self._pid = "<PID>" | |||
self._dump_callback = None | |||
if self._options.get("enable_cupti", 0): | |||
if cupti_available(): | |||
enable_cupti() | |||
else: | |||
get_logger().warning("CuPTI unavailable") | |||
@property | |||
def path(self): | |||
@@ -116,7 +130,7 @@ class Profiler(ContextDecorator): | |||
assert _running_profiler is self | |||
_running_profiler = None | |||
sync() | |||
full_sync() | |||
self._dump_callback = stop_profile() | |||
self._pid = os.getpid() | |||
_living_profilers.add(self) | |||
@@ -160,6 +174,9 @@ class Profiler(ContextDecorator): | |||
return func | |||
def __del__(self): | |||
if self._options.get("enable_cupti", 0): | |||
if cupti_available(): | |||
disable_cupti() | |||
self.dump() | |||
@@ -11,6 +11,7 @@ | |||
#include "megbrain/common.h" | |||
#include "megbrain/dtype.h" | |||
#include "megbrain/imperative/cpp_cupti.h" | |||
#include "megbrain/imperative/ops/autogen.h" | |||
#include "megbrain/imperative/ops/backward_graph.h" | |||
#include "megbrain/imperative/ops/utility.h" | |||
@@ -982,6 +983,7 @@ void init_tensor(py::module m) { | |||
m.def("stop_profile", [channel]() -> std::function<void(std::string, std::string)> { | |||
channel->stop_profile(); | |||
channel->sync(); | |||
CompNode::sync_all(); | |||
imperative::Profiler::stop_profile(); | |||
auto results = std::make_shared<imperative::Profiler::bundle_t>( | |||
imperative::Profiler::collect()); | |||
@@ -990,6 +992,9 @@ void init_tensor(py::module m) { | |||
results = nullptr; | |||
}; | |||
}); | |||
m.def("enable_cupti", &cupti::enable); | |||
m.def("disable_cupti", &cupti::disable); | |||
m.def("cupti_available", &cupti::available); | |||
m.def("sync", [channel]() { | |||
if (channel->check_available()) { | |||
channel->sync(); | |||
@@ -0,0 +1,273 @@ | |||
#include "megbrain/imperative/cpp_cupti.h" | |||
#include <cinttypes> | |||
#include <cstddef> | |||
#include <cstdlib> | |||
#include "megbrain/exception.h" | |||
#include "megbrain/imperative/profiler.h" | |||
#include "megbrain/imperative/utils/platform.h" | |||
#include "./profiler/events.h" | |||
#if MGB_CUPTI | |||
#include "cupti.h" | |||
// Evaluate a CUPTI API call (or an already obtained CUptiResult) and raise an
// mgb assertion failure carrying CUPTI's error description when the result is not
// CUPTI_SUCCESS.
//
// `call` is parenthesised so any expression can be passed safely, and `errstr`
// starts out with a placeholder because cuptiGetResultString() may itself fail
// (e.g. for an unrecognised code), which would otherwise leave it uninitialised.
#define CUPTI_CALL(call)                                                     \
    do {                                                                     \
        CUptiResult _status = (call);                                        \
        if (_status != CUPTI_SUCCESS) {                                      \
            const char* errstr = nullptr;                                    \
            if (cuptiGetResultString(_status, &errstr) != CUPTI_SUCCESS ||   \
                errstr == nullptr) {                                         \
                errstr = "<unknown cupti error>";                            \
            }                                                                \
            mgb_assert(_status == CUPTI_SUCCESS, "cupti error: %s", errstr); \
        }                                                                    \
    } while (0)
#endif | |||
namespace mgb::imperative::cupti { | |||
#if MGB_CUPTI | |||
namespace { | |||
CUpti_SubscriberHandle cuptiSubscriber; | |||
void cuptiSubscriberCallback( | |||
void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id, | |||
const void* cb_info) { | |||
using namespace profiler; | |||
switch (domain) { | |||
case CUPTI_CB_DOMAIN_DRIVER_API: { | |||
auto cb_data = (const CUpti_CallbackData*)cb_info; | |||
switch (cb_id) { | |||
case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: { | |||
if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||
MGB_RECORD_EVENT( | |||
CUPTIKernelLaunchEvent, cb_data->correlationId, | |||
cb_data->symbolName); | |||
} else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||
MGB_RECORD_EVENT( | |||
CUPTIKernelLaunchFinishEvent, cb_data->correlationId, | |||
cb_data->symbolName); | |||
} | |||
break; | |||
} | |||
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA: { | |||
} | |||
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: { | |||
if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||
MGB_RECORD_EVENT( | |||
CUPTIMemcpyLaunchEvent, cb_data->correlationId); | |||
} else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||
MGB_RECORD_EVENT( | |||
CUPTIMemcpyLaunchFinishEvent, cb_data->correlationId); | |||
} | |||
break; | |||
} | |||
default: { | |||
if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||
MGB_RECORD_EVENT( | |||
CUPTIDriverEvent, cb_data->correlationId, | |||
cb_data->functionName); | |||
} else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||
MGB_RECORD_EVENT( | |||
CUPTIDriverFinishEvent, cb_data->correlationId, | |||
cb_data->functionName); | |||
} | |||
} | |||
} | |||
break; | |||
} | |||
case CUPTI_CB_DOMAIN_RUNTIME_API: { | |||
auto cb_data = (const CUpti_CallbackData*)cb_info; | |||
if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||
MGB_RECORD_EVENT( | |||
CUPTIRuntimeEvent, cb_data->correlationId, | |||
cb_data->functionName); | |||
} else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||
MGB_RECORD_EVENT( | |||
CUPTIRuntimeFinishEvent, cb_data->correlationId, | |||
cb_data->functionName); | |||
} | |||
break; | |||
} | |||
} | |||
} | |||
// Convert one CUPTI activity record into a profiler event. Kernel, memcpy and
// memset records are handled; every other record kind is dropped silently.
void handleActivity(CUpti_Activity* record) {
    // Kernel and memset end timestamps are pulled back by this amount; memcpy is
    // reported unmodified.
    // NOTE(review): presumably this keeps back-to-back ranges from touching in the
    // timeline view -- confirm the intent.
    const std::chrono::nanoseconds delta{16};

    switch (record->kind) {
        case CUPTI_ACTIVITY_KIND_KERNEL:
        case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
            auto kernel_rec = cupti::activity<CUpti_ActivityKernel4>(record);
            MGB_RECORD_EVENT(
                    profiler::CUPTIKernelExecuteEvent, kernel_rec->correlationId,
                    kernel_rec->name, kernel_rec.stream(), kernel_rec.start(),
                    kernel_rec.end() - delta);
            break;
        }
        case CUPTI_ACTIVITY_KIND_MEMCPY: {
            auto memcpy_rec = cupti::activity<CUpti_ActivityMemcpy>(record);
            MGB_RECORD_EVENT(
                    profiler::CUPTIMemcpyEvent, memcpy_rec->correlationId,
                    memcpy_rec->srcKind, memcpy_rec->dstKind, memcpy_rec->bytes,
                    memcpy_rec.stream(), memcpy_rec.start(), memcpy_rec.end());
            break;
        }
        case CUPTI_ACTIVITY_KIND_MEMSET: {
            auto memset_rec = cupti::activity<CUpti_ActivityMemset>(record);
            MGB_RECORD_EVENT(
                    profiler::CUPTIMemsetEvent, memset_rec->correlationId,
                    memset_rec->value, memset_rec->bytes, memset_rec.stream(),
                    memset_rec.start(), memset_rec.end() - delta);
            break;
        }
        default:
            break;
    }
}
using activity_buffer_t = | |||
std::aligned_storage_t<8 * 1024 * 1024, ACTIVITY_RECORD_ALIGNMENT>; | |||
void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { | |||
*buffer = reinterpret_cast<uint8_t*>(new activity_buffer_t()); | |||
*size = sizeof(activity_buffer_t); | |||
*maxNumRecords = 0; | |||
} | |||
// CUPTI "buffer completed" callback: forwards every record stored in `buffer` to
// handleActivity(), verifies that CUPTI dropped no records, and releases the
// buffer that bufferRequested() allocated.
//
// The buffer is owned by a scope guard so that it is freed on every exit path,
// including the exceptions raised by CUPTI_CALL / mgb_assert / handleActivity;
// previously those paths skipped the trailing delete and leaked the buffer.
//
// `size` (the full buffer capacity) is unused; only `validSize` bytes hold
// records.
void bufferCompleted(
        CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size,
        size_t validSize) {
    struct BufferReleaser {
        activity_buffer_t* storage;
        ~BufferReleaser() { delete storage; }
    } releaser{reinterpret_cast<activity_buffer_t*>(buffer)};

    if (validSize == 0) {
        return;
    }

    CUpti_Activity* record = nullptr;
    for (;;) {
        CUptiResult status = cuptiActivityGetNextRecord(buffer, validSize, &record);
        if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
            // regular end of iteration: no more records in this buffer
            break;
        }
        // any other non-success status is a real error and throws
        CUPTI_CALL(status);
        handleActivity(record);
    }

    size_t dropped = 0;
    CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
    mgb_assert(dropped == 0, "%zu records dropped", dropped);
}
static bool initialized = false; | |||
} // namespace | |||
bool available() { | |||
uint32_t compiletime_version = (CUPTI_API_VERSION); | |||
uint32_t runtime_version; | |||
CUPTI_CALL(cuptiGetVersion(&runtime_version)); | |||
if (compiletime_version != runtime_version) { | |||
static std::once_flag once; | |||
std::call_once(once, [&] { | |||
mgb_log_warn( | |||
"CuPTI version %d mismatch against compiletime version %d. " | |||
"This may caused by user config LD_LIBRARY_PATH" | |||
"at unix-like env or config PATH at Windows env", | |||
(int)compiletime_version, (int)runtime_version); | |||
}); | |||
return false; | |||
} | |||
return true; | |||
} | |||
void enable() { | |||
// not thread safe | |||
mgb_assert(!initialized, "cupti already initialized"); | |||
// callback | |||
CUPTI_CALL(cuptiSubscribe( | |||
&cuptiSubscriber, (CUpti_CallbackFunc)cuptiSubscriberCallback, | |||
(void*)nullptr)); | |||
CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_DRIVER_API)); | |||
CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_RUNTIME_API)); | |||
// activity | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); | |||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); | |||
CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)); | |||
initialized = true; | |||
} | |||
// Turn off CUPTI tracing. Pending activity records are flushed first, then CUPTI
// is finalized. Asserts that enable() has been called before.
void disable() {
    mgb_assert(initialized, "cupti not initialized yet");

    // flush while `initialized` is still true, otherwise flush() is a no-op
    flush();
    CUPTI_CALL(cuptiFinalize());

    initialized = false;
}
void flush() { | |||
if (initialized) { | |||
CUPTI_CALL(cuptiActivityFlushAll(1)); | |||
} | |||
} | |||
bool enabled() { | |||
return initialized; | |||
} | |||
// Current CUPTI timestamp as a cupti::time_point (nanosecond resolution).
// Throws via CUPTI_CALL when the timestamp cannot be queried.
time_point clock::now() {
    uint64_t timestamp = 0;
    // The argument had been corrupted to "×tamp" (an HTML-entity mangling of
    // "&timestamp"), which does not compile.
    CUPTI_CALL(cuptiGetTimestamp(&timestamp));
    // overflow? -- the unsigned 64-bit tick count is narrowed to the signed rep
    return time_point(duration((int64_t)timestamp));
}
#else | |||
class CuPTIUnavailableError : public MegBrainError { | |||
public: | |||
CuPTIUnavailableError() | |||
: MegBrainError( | |||
#if MGB_CUDA | |||
"CuPTI disabled at compile time" | |||
#else | |||
"CuPTI unsupported on non cuda platform" | |||
#endif | |||
) { | |||
} | |||
}; | |||
bool available() { | |||
return false; | |||
} | |||
void enable() { | |||
throw CuPTIUnavailableError(); | |||
} | |||
void disable() { | |||
throw CuPTIUnavailableError(); | |||
} | |||
void flush() {} | |||
bool enabled() { | |||
return false; | |||
} | |||
time_point clock::now() { | |||
throw CuPTIUnavailableError(); | |||
} | |||
#endif | |||
} // namespace mgb::imperative::cupti |
@@ -12,7 +12,9 @@ | |||
#include "megbrain/imperative/profiler.h" | |||
#include <chrono> | |||
#include <unordered_map> | |||
#include "megbrain/imperative/cpp_cupti.h" | |||
#include "megbrain/imperative/ops/opr_attr.h" | |||
#include "megbrain/imperative/physical_tensor.h" | |||
@@ -48,6 +50,21 @@ bool Profiler::sm_profiling = false; | |||
thread_local Profiler* Profiler::tm_profiler = nullptr; | |||
std::atomic_size_t Profiler::sm_preferred_capacity; | |||
void Profiler::start_profile() { | |||
mgb_assert(!sm_profiling); | |||
sm_start_at = Timer::record_host(); | |||
sm_profiling = true; | |||
if (cupti::enabled()) { | |||
MGB_RECORD_EVENT(profiler::CUPTITimestampEvent, cupti::clock::now()); | |||
} | |||
} | |||
// End the profiling session. CUPTI activity buffers are flushed before profiling
// is switched off.
// NOTE(review): the order looks intentional so that flushed activity records are
// still recorded -- confirm against MGB_RECORD_EVENT.
void Profiler::stop_profile() {
    mgb_assert(sm_profiling);

    cupti::flush();

    sm_profiling = false;
}
auto Profiler::get_thread_dict() -> thread_dict_t { | |||
thread_dict_t thread_dict; | |||
for (auto&& [tid, profiler] : sm_profilers) { | |||
@@ -19,6 +19,7 @@ | |||
#include "nlohmann/json.hpp" | |||
#include "megbrain/imperative/utils/platform.h" | |||
#include "megbrain/utils/debug.h" | |||
#include "./formats.h" | |||
@@ -198,6 +199,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
decltype(getpid()) pid = getpid(); | |||
std::string pid_str = std::to_string(pid); | |||
ChromeTimelineEventVisitor() {} | |||
ChromeTraceEvent& new_event( | |||
std::string name, char ph, size_t tid, profiler::HostTime time) { | |||
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts( | |||
@@ -213,8 +216,13 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
.ts(since_start(current->time)); | |||
} | |||
// Create a trace event located on the timeline row of a CUPTI stream, with the
// CUPTI timestamp converted to host time.
ChromeTraceEvent& new_cupti_event(
        std::string name, char ph, cupti::stream_t stream,
        cupti::time_point timestamp) {
    return new_event(
            name,                         // event name
            ph,                           // chrome trace phase character
            to_tid(stream),               // row of the stream
            time_from_cupti(timestamp));  // CUPTI clock -> host clock
}
ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { | |||
using namespace std::literals::chrono_literals; | |||
auto time = since_start(to_device_time(current->time, device)); | |||
return trace_events.new_event() | |||
.name(name) | |||
@@ -391,6 +399,80 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>( | |||
current_device_time - current_host_time); | |||
new_host_event("device_ahead_ms", 'C').arg("value", device_ahead.count()); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchEvent>) { | |||
new_host_event(demangle(event.name), 'B'); | |||
new_host_event(pid_str, 's') | |||
.id(event.correlation_id) | |||
.cat("KernelLink") | |||
.scope(pid_str); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchFinishEvent>) { | |||
new_host_event(demangle(event.name), 'E'); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIKernelExecuteEvent>) { | |||
new_cupti_event(demangle(event.name), 'B', event.stream, event.start) | |||
.arg("execution_time", (event.end - event.start).count()); | |||
new_cupti_event(pid_str, 'f', event.stream, event.end) | |||
.id(event.correlation_id) | |||
.bp('e') | |||
.cat("KernelLink") | |||
.scope(pid_str); | |||
new_cupti_event(demangle(event.name), 'E', event.stream, event.end) | |||
.arg("execution_time", (event.end - event.start).count()); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchEvent>) { | |||
new_host_event("Memcpy", 'B'); | |||
new_host_event(pid_str, 's') | |||
.id(event.correlation_id) | |||
.cat("CUPTILink") | |||
.scope(pid_str); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchFinishEvent>) { | |||
new_host_event("Memcpy", 'E'); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyEvent>) { | |||
// Map a CUPTI memory-kind code to its enumerator name; codes outside the table
// yield "invalid".
auto memkind2str = [](uint8_t kind) {
    const char* const valid_kinds[] = {
            "CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN",
            "CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE",
            "CUPTI_ACTIVITY_MEMORY_KIND_PINNED",
            "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE",
            "CUPTI_ACTIVITY_MEMORY_KIND_ARRAY",
            "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED",
            "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC",
            "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC"};
    const size_t num_kinds = sizeof(valid_kinds) / sizeof(valid_kinds[0]);
    // `>=`, not `>`: an index equal to the element count is already one past the
    // end of the table.
    if (kind >= num_kinds) {
        return "invalid";
    }
    return valid_kinds[kind];
};
new_cupti_event("Memcpy", 'B', event.stream, event.start) | |||
.arg("bytes", imperative::to_string(event.bytes)) | |||
.arg("src_kind", memkind2str(event.src_kind)) | |||
.arg("dst_kind", memkind2str(event.dst_kind)); | |||
new_cupti_event(pid_str, 'f', event.stream, event.start) | |||
.id(event.correlation_id) | |||
.bp('e') | |||
.cat("CUPTILink") | |||
.scope(pid_str); | |||
new_cupti_event("Memcpy", 'E', event.stream, event.end) | |||
.arg("bytes", imperative::to_string(event.bytes)) | |||
.arg("src_kind", memkind2str(event.src_kind)) | |||
.arg("dst_kind", memkind2str(event.dst_kind)); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIMemsetEvent>) { | |||
new_cupti_event("Memset", 'B', event.stream, event.start) | |||
.arg("value", imperative::to_string(event.value)) | |||
.arg("bytes", imperative::to_string(event.bytes)); | |||
new_cupti_event("Memset", 'E', event.stream, event.start) | |||
.arg("value", imperative::to_string(event.value)) | |||
.arg("bytes", imperative::to_string(event.bytes)); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeEvent>) { | |||
new_host_event(event.name, 'B'); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeFinishEvent>) { | |||
new_host_event(event.name, 'E'); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIDriverEvent>) { | |||
new_host_event(event.name, 'B'); | |||
new_host_event(pid_str, 's') | |||
.id(event.correlation_id) | |||
.cat("CUPTILink") | |||
.scope(pid_str); | |||
} else if constexpr (std::is_same_v<TEvent, CUPTIDriverFinishEvent>) { | |||
new_host_event(event.name, 'E'); | |||
} | |||
} | |||
@@ -403,7 +485,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
if (thread_dict.count(host)) { | |||
trace_events.new_event() | |||
.name("thread_name") | |||
.pid('M') | |||
.ph('M') | |||
.pid(pid) | |||
.tid(to_tid(host)) | |||
.arg("name", thread_dict.at(host)); | |||
} | |||
@@ -411,7 +494,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
for (auto&& device : devices()) { | |||
trace_events.new_event() | |||
.name("thread_name") | |||
.pid('M') | |||
.ph('M') | |||
.pid(pid) | |||
.tid(to_tid(device)) | |||
.arg("name", device.to_string_logical()); | |||
} | |||
@@ -419,7 +503,7 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||
}; | |||
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) { | |||
ChromeTimelineEventVisitor visitor; | |||
ChromeTimelineEventVisitor visitor{}; | |||
visitor.process_events(result); | |||
visitor.name_threads(result.thread_dict); | |||
auto trace_events = std::move(visitor.trace_events); | |||
@@ -16,6 +16,7 @@ | |||
#include "../interpreter/stack_manager.h" | |||
#include "../op_trait.h" | |||
#include "megbrain/imperative/cpp_cupti.h" | |||
namespace mgb::imperative::profiler { | |||
@@ -181,6 +182,60 @@ DEF_DUR_EVENT(HostToDevice, { | |||
void* device_ptr; | |||
}); | |||
// cupti events | |||
DEF_EVENT(CUPTITimestamp, { cupti::clock::time_point timestamp; }); | |||
DEF_DUR_EVENT(CUPTIKernelLaunch, { | |||
uint32_t correlation_id; | |||
const char* name; | |||
}); | |||
DEF_EVENT(CUPTIKernelExecute, { | |||
uint32_t correlation_id; | |||
const char* name; | |||
cupti::stream_t stream; | |||
cupti::time_point start; | |||
cupti::time_point end; | |||
}); | |||
DEF_DUR_EVENT(CUPTIMemcpyLaunch, { uint32_t correlation_id; }); | |||
DEF_EVENT(CUPTIMemcpy, { | |||
uint32_t correlation_id; | |||
uint8_t src_kind; | |||
uint8_t dst_kind; | |||
uint64_t bytes; | |||
cupti::stream_t stream; | |||
cupti::time_point start; | |||
cupti::time_point end; | |||
}); | |||
DEF_EVENT(CUPTIMemset, { | |||
uint32_t correlation_id; | |||
uint32_t value; | |||
uint64_t bytes; | |||
cupti::stream_t stream; | |||
cupti::time_point start; | |||
cupti::time_point end; | |||
}); | |||
DEF_EVENT(CUPTIUnknownDevice, {}); | |||
DEF_DUR_EVENT(CUPTIRuntime, { | |||
uint32_t correlation_id; | |||
const char* name; | |||
}); | |||
DEF_DUR_EVENT(CUPTIDriver, { | |||
uint32_t correlation_id; | |||
const char* name; | |||
}); | |||
DEF_EVENT(CUPTIIdentifyStream, { | |||
cupti::stream_t stream; | |||
CompNode device; | |||
}); | |||
#undef DEF_EVENT | |||
#undef DEF_DUR_EVENT | |||
@@ -180,10 +180,13 @@ private: | |||
HostTime m_start_time; | |||
CompNode::UnorderedMap<size_t> m_device_tid_table; | |||
std::unordered_map<std::thread::id, size_t> m_host_tid_table; | |||
std::unordered_map<cupti::stream_t, size_t> m_cupti_tid_table; | |||
CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> | |||
m_device_timeline; | |||
std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack; | |||
std::unordered_map<std::string, int64_t> m_counter_table; | |||
std::optional<std::pair<profiler::HostTime, cupti::time_point>> m_cupti_timestamp = | |||
{}; | |||
protected: | |||
Profiler::Record* current; | |||
@@ -191,6 +194,11 @@ protected: | |||
ProfileTensorState* current_tensor; | |||
protected: | |||
// Next timeline row id to hand out: the combined number of entries in the host,
// device and CUPTI stream tid tables.
size_t next_tid() {
    size_t registered = m_host_tid_table.size();
    registered += m_device_tid_table.size();
    registered += m_cupti_tid_table.size();
    return registered;
}
profiler::Duration since_start(profiler::HostTime time) { | |||
return time - m_start_time; | |||
} | |||
@@ -229,6 +237,10 @@ protected: | |||
size_t to_tid(CompNode device) { return m_device_tid_table.at(device); } | |||
// Timeline row id of a CUPTI stream. Uses unordered_map::at, so a stream without
// a table entry raises std::out_of_range.
size_t to_tid(cupti::stream_t cupti_stream) {
    const size_t stream_tid = m_cupti_tid_table.at(cupti_stream);
    return stream_tid;
}
SmallVector<std::thread::id> host_threads() { | |||
SmallVector<std::thread::id> host_threads; | |||
for (auto&& [host, _] : m_host_tid_table) { | |||
@@ -254,6 +266,13 @@ protected: | |||
value += delta; | |||
} | |||
// Translate a CUPTI clock reading into host time using the stored
// (host time, CUPTI time) reference pair: host_ref + (timestamp - cupti_ref).
// Asserts that the reference pair has been set.
profiler::HostTime time_from_cupti(cupti::time_point timestamp) {
    mgb_assert(m_cupti_timestamp.has_value());
    const auto& [host_ref, cupti_ref] = *m_cupti_timestamp;
    const auto elapsed = std::chrono::duration_cast<profiler::HostTime::duration>(
            timestamp - cupti_ref);
    return host_ref + elapsed;
}
public: | |||
void process_events(Profiler::bundle_t& bundle) { | |||
m_start_time = bundle.start_at; | |||
@@ -272,7 +291,11 @@ public: | |||
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, | |||
AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, | |||
ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, | |||
HostToDeviceFinishEvent> | |||
HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent, | |||
CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent, | |||
CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent, | |||
CUPTIRuntimeEvent, CUPTIRuntimeFinishEvent, CUPTIDriverEvent, | |||
CUPTIDriverFinishEvent, CUPTIMemsetEvent> | |||
converter; | |||
auto for_each_entry = [&](auto&& handler) { | |||
@@ -289,7 +312,9 @@ public: | |||
std::shared_ptr<CompNode::Event> device; | |||
}; | |||
CompNode::UnorderedMap<DeviceStartPair> device_start_table; | |||
std::unordered_map<cupti::stream_t, CompNode> cupti_stream_table; | |||
// record device time | |||
for_each_entry([&](auto&& event) { | |||
using T = std::decay_t<decltype(event)>; | |||
if constexpr (std::is_same_v<T, RecordDeviceEvent>) { | |||
@@ -313,8 +338,7 @@ public: | |||
// register host threads | |||
for_each_entry([&](auto&& event) { | |||
if (!m_host_tid_table.count(current->tid)) { | |||
m_host_tid_table[current->tid] = { | |||
m_device_tid_table.size() + m_host_tid_table.size()}; | |||
m_host_tid_table[current->tid] = next_tid(); | |||
} | |||
}); | |||
@@ -340,14 +364,39 @@ public: | |||
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | |||
auto& tensor = m_tensors[event.tensor_id]; | |||
if (!m_device_tid_table.count(event.device)) { | |||
m_device_tid_table[event.device] = { | |||
m_device_tid_table.size() + m_host_tid_table.size()}; | |||
m_device_tid_table[event.device] = next_tid(); | |||
} | |||
tensor.device = event.device; | |||
tensor.layout = event.layout; | |||
} | |||
}); | |||
for_each_entry([&](auto&& event) { | |||
using T = std::decay_t<decltype(event)>; | |||
if constexpr (std::is_same_v<T, CUPTIIdentifyStreamEvent>) { | |||
if (!m_cupti_tid_table.count(event.stream)) { | |||
m_cupti_tid_table[event.stream] = | |||
m_device_tid_table.at(event.device); | |||
} | |||
} | |||
}); | |||
// record cupti streams | |||
for_each_entry([&](auto&& event) { | |||
using T = std::decay_t<decltype(event)>; | |||
if constexpr ( | |||
std::is_same_v<T, CUPTIKernelExecuteEvent> || | |||
std::is_same_v<T, CUPTIMemcpyEvent> || | |||
std::is_same_v<T, CUPTIMemsetEvent>) { | |||
if (!m_cupti_tid_table.count(event.stream)) { | |||
m_cupti_tid_table[event.stream] = next_tid(); | |||
} | |||
} else if constexpr (std::is_same_v<T, CUPTITimestampEvent>) { | |||
mgb_assert(!m_cupti_timestamp.has_value()); | |||
m_cupti_timestamp.emplace(current->time, event.timestamp); | |||
} | |||
}); | |||
// replay execution | |||
using namespace std::placeholders; | |||
for_each_entry([&](auto&& event) { | |||
@@ -0,0 +1,25 @@ | |||
#include "megbrain/imperative/utils/platform.h" | |||
#ifdef __GNUG__ | |||
#include <cxxabi.h> | |||
#include <cstdlib> | |||
#include <memory> | |||
#endif | |||
using namespace mgb; | |||
using namespace imperative; | |||
/* | |||
* demangle typeid, see | |||
* http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname | |||
*/ | |||
std::string mgb::imperative::demangle(std::string mangled) { | |||
#ifdef __GNUG__ | |||
int status = -1; | |||
std::unique_ptr<char, void (*)(void*)> res{ | |||
abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status), std::free}; | |||
return (status == 0) ? res.get() : mangled; | |||
#else | |||
return mangled; | |||
#endif | |||
} |
@@ -0,0 +1,86 @@ | |||
#pragma once | |||
#include <chrono> | |||
#include <ctime> | |||
#include "megbrain/common.h" | |||
#include "megbrain/imperative/utils/to_string.h" | |||
namespace mgb::imperative::cupti { | |||
// Clock type (std::chrono clock member layout) for CUPTI timestamps; ticks are
// nanoseconds. now() is defined in the implementation file and may throw, hence
// no noexcept.
struct clock {
    using duration = std::chrono::nanoseconds;
    using rep = duration::rep;
    using period = duration::period;
    using time_point = std::chrono::time_point<clock>;

    static const bool is_steady = false;

    static time_point now() /* noexcept */;
};
using time_point = clock::time_point; | |||
using duration = clock::duration; | |||
// Identity of a device.
struct device_t {
    uint32_t device_id;

    bool operator==(const device_t& other) const {
        return device_id == other.device_id;
    }
};

// Identity of a context: the device it belongs to plus a context id.
struct context_t : device_t {
    uint32_t context_id;

    bool operator==(const context_t& other) const {
        if (!device_t::operator==(other)) {
            return false;
        }
        return context_id == other.context_id;
    }
};

// Identity of a stream: the context it belongs to plus a stream id.
struct stream_t : context_t {
    uint32_t stream_id;

    bool operator==(const stream_t& other) const {
        if (!context_t::operator==(other)) {
            return false;
        }
        return stream_id == other.stream_id;
    }
};
bool available(); | |||
void enable(); | |||
void disable(); | |||
void flush(); | |||
bool enabled(); | |||
template <typename TActivity> | |||
struct activity { | |||
private: | |||
TActivity* m_ptr; | |||
public: | |||
activity(void* ptr) : m_ptr((TActivity*)ptr) {} | |||
time_point start() const { return time_point(duration(m_ptr->start)); } | |||
time_point end() const { return time_point(duration(m_ptr->end)); } | |||
device_t device() const { return {m_ptr->deviceId}; } | |||
context_t context() const { return {device(), m_ptr->contextId}; } | |||
stream_t stream() const { return {context(), m_ptr->streamId}; } | |||
TActivity* operator->() const { return m_ptr; } | |||
}; | |||
} // namespace mgb::imperative::cupti | |||
template <> | |||
class std::hash<mgb::imperative::cupti::stream_t> { | |||
public: | |||
size_t operator()(const mgb::imperative::cupti::stream_t& value) const { | |||
return value.stream_id; | |||
} | |||
}; |
@@ -194,16 +194,9 @@ public: | |||
static bool is_profiling() { return sm_profiling; } | |||
static void start_profile() { | |||
mgb_assert(!sm_profiling); | |||
sm_start_at = Timer::record_host(); | |||
sm_profiling = true; | |||
} | |||
static void start_profile(); | |||
static void stop_profile() { | |||
mgb_assert(sm_profiling); | |||
sm_profiling = false; | |||
} | |||
static void stop_profile(); | |||
static thread_dict_t get_thread_dict(); | |||
@@ -0,0 +1,9 @@ | |||
#pragma once | |||
#include <string> | |||
namespace mgb::imperative { | |||
std::string demangle(std::string mangled); | |||
} |
@@ -37,6 +37,10 @@ if(MGE_WITH_CUDA) | |||
list(APPEND LINK_LIBS cudart) | |||
endif() | |||
if(MGE_WITH_CUPTI) | |||
list(APPEND LINK_LIBS libcupti) | |||
endif() | |||
if(MGE_WITH_DISTRIBUTED) | |||
list(APPEND LINK_LIBS megray) | |||
endif() | |||
@@ -61,11 +61,11 @@ echo "Build with ${SDK_NAME}" | |||
if [ $SDK_NAME == "cu101" ];then | |||
CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | |||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" | |||
BUILD_GCC8="ON" | |||
REQUIR_CUDA_VERSION="10010" | |||
REQUIR_CUDNN_VERSION="7.6.3" | |||
REQUIR_TENSORRT_VERSION="6.0.1.5" | |||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" | |||
BUILD_GCC8="ON" | |||
REQUIR_CUDA_VERSION="10010" | |||
REQUIR_CUDNN_VERSION="7.6.3" | |||
REQUIR_TENSORRT_VERSION="6.0.1.5" | |||
REQUIR_CUBLAS_VERSION="10.2.1.243" | |||
elif [ $SDK_NAME == "cu102_JetsonNano" ];then | |||
@@ -87,6 +87,12 @@ elif [ $SDK_NAME == "cu102_JetsonNano" ];then | |||
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn.so.8" | |||
if [ ${machine} == "aarch64" ];then | |||
CUDA_COPY_LIB_LIST="\ | |||
${CUDA_LIB_DIR}/libcupti.so.10.2:\ | |||
${CUDA_COPY_LIB_LIST}" | |||
fi | |||
EXTRA_CMAKE_FLAG="-DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_53,code=sm_53\" " | |||
elif [ $SDK_NAME == "cu111" ];then | |||
@@ -118,6 +124,12 @@ elif [ $SDK_NAME == "cu111" ];then | |||
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn.so.8" | |||
if [ ${machine} == "aarch64" ];then | |||
CUDA_COPY_LIB_LIST="\ | |||
${CUDA_LIB_DIR}/libcupti.so.11.1:\ | |||
${CUDA_COPY_LIB_LIST}" | |||
fi | |||
if [ ${IN_CI} = "true" ] && [ ${machine} == "aarch64" ]; then | |||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_75,code=sm_75\" " | |||
else | |||
@@ -152,9 +164,9 @@ elif [ $SDK_NAME == "cu112" ];then | |||
-gencode arch=compute_86,code=sm_86 \ | |||
-gencode arch=compute_86,code=compute_86\" " | |||
REQUIR_CUDA_VERSION="11020" | |||
REQUIR_CUDNN_VERSION="8.0.4" | |||
REQUIR_TENSORRT_VERSION="7.2.2.3" | |||
REQUIR_CUDA_VERSION="11020" | |||
REQUIR_CUDNN_VERSION="8.0.4" | |||
REQUIR_TENSORRT_VERSION="7.2.2.3" | |||
REQUIR_CUBLAS_VERSION="11.3.1.68" | |||
elif [ $SDK_NAME == "cpu" ];then | |||
@@ -35,6 +35,7 @@ | |||
#cmakedefine01 MGB_ENABLE_FBS_SERIALIZATION | |||
#cmakedefine01 MGB_IS_DEV | |||
#cmakedefine01 MGB_CUSTOM_OP | |||
#cmakedefine01 MGB_CUPTI | |||
// DNN related flags | |||
// Platform macro's | |||
#cmakedefine01 MEGDNN_WITH_CUDA | |||