@@ -29,6 +29,7 @@ endif() | |||||
include(GNUInstallDirs) | include(GNUInstallDirs) | ||||
include(CheckCXXCompilerFlag) | include(CheckCXXCompilerFlag) | ||||
include(CheckIPOSupported) | include(CheckIPOSupported) | ||||
include(CMakeDependentOption) | |||||
check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) | check_cxx_compiler_flag(-Wclass-memaccess CXX_SUPPORT_WCLASS_MEMACCESS) | ||||
@@ -97,6 +98,12 @@ option(MGE_BUILD_WITH_ASAN "Enable build with ASAN, need compiler support" OFF) | |||||
option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF) | option(MGE_WITH_CUSTOM_OP "Build with Custom op" OFF) | ||||
option(MGE_SYNC_THIRD_PARTY "help sync third_party submodule" OFF) | option(MGE_SYNC_THIRD_PARTY "help sync third_party submodule" OFF) | ||||
# TODO: add windows support | |||||
cmake_dependent_option(MGE_WITH_CUPTI "Build with CUPTI" ON | |||||
"MGE_WITH_CUDA;MGE_BUILD_IMPERATIVE_RT;NOT MSVC;NOT WIN32" OFF) | |||||
set(MGB_CUPTI ${MGE_WITH_CUPTI}) | |||||
if(MSVC OR WIN32) | if(MSVC OR WIN32) | ||||
# FIXME: static link Windows vc runtime with some version from Visual Studio have some | # FIXME: static link Windows vc runtime with some version from Visual Studio have some | ||||
# runtime issue at some call PATH, for example: _imperative_rt.pyd --> | # runtime issue at some call PATH, for example: _imperative_rt.pyd --> | ||||
@@ -686,6 +693,10 @@ if(MGB_WITH_FLATBUFFERS) | |||||
include(cmake/flatbuffers.cmake) | include(cmake/flatbuffers.cmake) | ||||
endif() | endif() | ||||
if(MGE_WITH_CUPTI) | |||||
include(cmake/cupti.cmake) | |||||
endif() | |||||
if(MGE_WITH_CUDA) | if(MGE_WITH_CUDA) | ||||
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) | include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) | ||||
foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) | foreach(path ${CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES}) | ||||
@@ -6,7 +6,7 @@ endif() | |||||
if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") | if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") | ||||
set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | ||||
endif() | endif() | ||||
message("CUDNN ROOT: " ${CUDNN_ROOT_DIR}) | |||||
message(STATUS "CUDNN ROOT: ${CUDNN_ROOT_DIR}") | |||||
if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) | if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) | ||||
find_library( | find_library( | ||||
CUDNN_LIBRARY | CUDNN_LIBRARY | ||||
@@ -0,0 +1,85 @@ | |||||
# Locate the CuPTI library and headers inside a CUDA toolkit and expose them as
# the imported target `libcupti`.
#
# Inputs read by this script:
#   CUDA_ROOT_DIR         toolkit root; when empty it is taken from the environment
#                         variables CUDA_ROOT_DIR, CUDA_PATH, CUDA_BIN_PATH (in that order)
#   MGE_CUDA_USE_STATIC   request the static CuPTI archive
#   CXX_SUPPORT_GOLD      when true together with static linking, forces the shared library
#
# Results:
#   CUPTI_LIBRARY, CUPTI_INCLUDE_DIR  find_library()/find_path() results
#   CUPTI_LIBRARY_TYPE                STATIC or SHARED
#   CUPTI_API_VERSION                 value parsed from the CuPTI headers
#   libcupti                          imported library target

# The first non-empty candidate wins; a CUDA_ROOT_DIR that is already set is kept.
foreach(cuda_root_env CUDA_ROOT_DIR CUDA_PATH CUDA_BIN_PATH)
  if("${CUDA_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{${cuda_root_env}}" STREQUAL "")
    set(CUDA_ROOT_DIR "$ENV{${cuda_root_env}}")
  endif()
endforeach()

if("${CUDA_ROOT_DIR}" STREQUAL "")
  message(
    FATAL_ERROR
      "Can not find CUDA, please export cuda sdk path to CUDA_ROOT_DIR or CUDA_PATH or CUDA_BIN_PATH"
  )
endif()

# TODO: find_library(CUDA_ROOT_DIR) in cmake/cuda.cmake

set(MGE_CUPTI_USE_STATIC ${MGE_CUDA_USE_STATIC})

# relates https://stackoverflow.com/questions/67485114
# Variable names are used directly (not `${...}`) so that an empty or undefined
# variable evaluates to false instead of producing a malformed if() expression.
if(MGE_CUDA_USE_STATIC AND CXX_SUPPORT_GOLD)
  message(
    WARNING
      "static linking CuPTI with gold may break exception handling, use shared one instead"
  )
  set(MGE_CUPTI_USE_STATIC OFF)
endif()

if(MGE_CUPTI_USE_STATIC)
  find_library(
    CUPTI_LIBRARY
    NAMES libcupti_static.a
    HINTS ${CUDA_ROOT_DIR} ${CUDA_ROOT_DIR}/extras/CUPTI
    PATH_SUFFIXES lib lib64
    DOC "CuPTI library.")
  # if(NOT var) is true for both an empty value and any *-NOTFOUND value.
  if(NOT CUPTI_LIBRARY)
    message(WARNING "Can not find static CuPTI Library, use shared one instead")
    set(MGE_CUPTI_USE_STATIC OFF)
  endif()
endif()

if(MGE_CUPTI_USE_STATIC)
  set(CUPTI_LIBRARY_TYPE STATIC)
else()
  # find_library() searches again here because a NOTFOUND result is not kept.
  find_library(
    CUPTI_LIBRARY
    NAMES libcupti.so
    HINTS ${CUDA_ROOT_DIR} ${CUDA_ROOT_DIR}/extras/CUPTI
    PATH_SUFFIXES lib lib64
    DOC "CuPTI library.")
  set(CUPTI_LIBRARY_TYPE SHARED)
endif()

if(NOT CUPTI_LIBRARY)
  message(FATAL_ERROR "Can not find CuPTI Library")
endif()

find_path(
  CUPTI_INCLUDE_DIR
  NAMES cupti.h
  HINTS ${CUDA_ROOT_DIR} ${CUDA_ROOT_DIR}/extras/CUPTI
  PATH_SUFFIXES include
  DOC "Path to CuPTI include directory.")

if(NOT CUPTI_INCLUDE_DIR)
  message(FATAL_ERROR "Can not find CuPTI INCLUDE")
endif()

# Prefer cupti_version.h when present; otherwise read cupti.h itself.
if(EXISTS "${CUPTI_INCLUDE_DIR}/cupti_version.h")
  file(READ "${CUPTI_INCLUDE_DIR}/cupti_version.h" CUPTI_VERSION_FILE_CONTENTS)
else()
  file(READ "${CUPTI_INCLUDE_DIR}/cupti.h" CUPTI_VERSION_FILE_CONTENTS)
endif()

# First isolate the `define CUPTI_API_VERSION <n>` text, then reduce it to <n>.
# CUPTI_API_VERSION ends up empty when the header contains no such define.
string(REGEX MATCH "define CUPTI_API_VERSION * +([0-9]+)" CUPTI_API_VERSION
             "${CUPTI_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define CUPTI_API_VERSION * +([0-9]+)" "\\1" CUPTI_API_VERSION
                     "${CUPTI_API_VERSION}")

add_library(libcupti ${CUPTI_LIBRARY_TYPE} IMPORTED)

set_target_properties(
  libcupti PROPERTIES IMPORTED_LOCATION "${CUPTI_LIBRARY}"
                      INTERFACE_INCLUDE_DIRECTORIES "${CUPTI_INCLUDE_DIR}")

message(STATUS "Found CuPTI: ${CUPTI_LIBRARY} (found version: ${CUPTI_API_VERSION})")
@@ -36,7 +36,7 @@ else() | |||||
PATH_SUFFIXES lib lib64 | PATH_SUFFIXES lib lib64 | ||||
DOC "TRT plugin library.") | DOC "TRT plugin library.") | ||||
endif() | endif() | ||||
message("TRT_LIBRARY" ${TRT_LIBRARY}) | |||||
message(STATUS "TRT_LIBRARY: ${TRT_LIBRARY}") | |||||
if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") | if(TRT_LIBRARY STREQUAL "TRT_LIBRARY-NOTFOUND") | ||||
message( | message( | ||||
FATAL_ERROR | FATAL_ERROR | ||||
@@ -51,6 +51,10 @@ if(ANDROID) | |||||
target_link_libraries(${MODULE_NAME} PRIVATE ${PYTHON_LIBRARIES}) | target_link_libraries(${MODULE_NAME} PRIVATE ${PYTHON_LIBRARIES}) | ||||
endif() | endif() | ||||
if(MGE_WITH_CUPTI) | |||||
target_link_libraries(${MODULE_NAME} PRIVATE libcupti) | |||||
endif() | |||||
add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 | add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/range-v3 | ||||
${PROJECT_BINARY_DIR}/third_party/range-v3) | ${PROJECT_BINARY_DIR}/third_party/range-v3) | ||||
target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | target_link_libraries(${MODULE_NAME} PRIVATE range-v3) | ||||
@@ -16,6 +16,10 @@ from weakref import WeakSet | |||||
from .. import _atexit | from .. import _atexit | ||||
from ..core._imperative_rt.core2 import ( | from ..core._imperative_rt.core2 import ( | ||||
cupti_available, | |||||
disable_cupti, | |||||
enable_cupti, | |||||
full_sync, | |||||
pop_scope, | pop_scope, | ||||
push_scope, | push_scope, | ||||
start_profile, | start_profile, | ||||
@@ -50,13 +54,18 @@ class Profiler(ContextDecorator): | |||||
with profiler: | with profiler: | ||||
# your code here | # your code here | ||||
# Then open the profile file in chrome timeline window | # Then open the profile file in chrome timeline window | ||||
""" | """ | ||||
CHROME_TIMELINE = "chrome_timeline.json" | CHROME_TIMELINE = "chrome_timeline.json" | ||||
valid_options = {"sample_rate": 0, "profile_device": 1, "num_tensor_watch": 10} | |||||
valid_options = { | |||||
"sample_rate": 0, | |||||
"profile_device": 1, | |||||
"num_tensor_watch": 10, | |||||
"enable_cupti": 0, | |||||
} | |||||
valid_formats = {"chrome_timeline.json", "memory_flow.svg"} | valid_formats = {"chrome_timeline.json", "memory_flow.svg"} | ||||
def __init__( | def __init__( | ||||
@@ -83,6 +92,11 @@ class Profiler(ContextDecorator): | |||||
self._options[opt] = int(kwargs.pop(opt, optval)) | self._options[opt] = int(kwargs.pop(opt, optval)) | ||||
self._pid = "<PID>" | self._pid = "<PID>" | ||||
self._dump_callback = None | self._dump_callback = None | ||||
if self._options.get("enable_cupti", 0): | |||||
if cupti_available(): | |||||
enable_cupti() | |||||
else: | |||||
get_logger().warning("CuPTI unavailable") | |||||
@property | @property | ||||
def path(self): | def path(self): | ||||
@@ -116,7 +130,7 @@ class Profiler(ContextDecorator): | |||||
assert _running_profiler is self | assert _running_profiler is self | ||||
_running_profiler = None | _running_profiler = None | ||||
sync() | |||||
full_sync() | |||||
self._dump_callback = stop_profile() | self._dump_callback = stop_profile() | ||||
self._pid = os.getpid() | self._pid = os.getpid() | ||||
_living_profilers.add(self) | _living_profilers.add(self) | ||||
@@ -160,6 +174,9 @@ class Profiler(ContextDecorator): | |||||
return func | return func | ||||
def __del__(self):
    """Release CuPTI (when this profiler requested it) and dump the results.

    ``self.dump()`` runs in a ``finally`` clause so that an error raised while
    shutting CuPTI down does not prevent the collected profile from being
    written.
    """
    try:
        # Mirrors __init__: CuPTI is only touched when the option was set
        # and the binding reports it as available.
        if self._options.get("enable_cupti", 0) and cupti_available():
            disable_cupti()
    finally:
        self.dump()
@@ -11,6 +11,7 @@ | |||||
#include "megbrain/common.h" | #include "megbrain/common.h" | ||||
#include "megbrain/dtype.h" | #include "megbrain/dtype.h" | ||||
#include "megbrain/imperative/cpp_cupti.h" | |||||
#include "megbrain/imperative/ops/autogen.h" | #include "megbrain/imperative/ops/autogen.h" | ||||
#include "megbrain/imperative/ops/backward_graph.h" | #include "megbrain/imperative/ops/backward_graph.h" | ||||
#include "megbrain/imperative/ops/utility.h" | #include "megbrain/imperative/ops/utility.h" | ||||
@@ -982,6 +983,7 @@ void init_tensor(py::module m) { | |||||
m.def("stop_profile", [channel]() -> std::function<void(std::string, std::string)> { | m.def("stop_profile", [channel]() -> std::function<void(std::string, std::string)> { | ||||
channel->stop_profile(); | channel->stop_profile(); | ||||
channel->sync(); | channel->sync(); | ||||
CompNode::sync_all(); | |||||
imperative::Profiler::stop_profile(); | imperative::Profiler::stop_profile(); | ||||
auto results = std::make_shared<imperative::Profiler::bundle_t>( | auto results = std::make_shared<imperative::Profiler::bundle_t>( | ||||
imperative::Profiler::collect()); | imperative::Profiler::collect()); | ||||
@@ -990,6 +992,9 @@ void init_tensor(py::module m) { | |||||
results = nullptr; | results = nullptr; | ||||
}; | }; | ||||
}); | }); | ||||
m.def("enable_cupti", &cupti::enable); | |||||
m.def("disable_cupti", &cupti::disable); | |||||
m.def("cupti_available", &cupti::available); | |||||
m.def("sync", [channel]() { | m.def("sync", [channel]() { | ||||
if (channel->check_available()) { | if (channel->check_available()) { | ||||
channel->sync(); | channel->sync(); | ||||
@@ -0,0 +1,273 @@ | |||||
#include "megbrain/imperative/cpp_cupti.h" | |||||
#include <cinttypes> | |||||
#include <cstddef> | |||||
#include <cstdlib> | |||||
#include "megbrain/exception.h" | |||||
#include "megbrain/imperative/profiler.h" | |||||
#include "megbrain/imperative/utils/platform.h" | |||||
#include "./profiler/events.h" | |||||
#if MGB_CUPTI | |||||
#include "cupti.h" | |||||
// Evaluates a CUPTI expression exactly once and reports a non-success result
// through mgb_assert, including CUPTI's textual description of the error.
// `errstr` starts as nullptr and falls back to a fixed message so that "%s"
// never receives an indeterminate pointer when cuptiGetResultString itself
// fails. The argument is parenthesised so any expression can be passed.
#define CUPTI_CALL(call)                                                      \
    do {                                                                      \
        CUptiResult _status = (call);                                         \
        if (_status != CUPTI_SUCCESS) {                                       \
            const char* errstr = nullptr;                                     \
            if (cuptiGetResultString(_status, &errstr) != CUPTI_SUCCESS ||   \
                errstr == nullptr) {                                          \
                errstr = "unknown cupti error";                               \
            }                                                                 \
            mgb_assert(_status == CUPTI_SUCCESS, "cupti error: %s", errstr);  \
        }                                                                     \
    } while (0)
#endif | |||||
namespace mgb::imperative::cupti { | |||||
#if MGB_CUPTI | |||||
namespace { | |||||
CUpti_SubscriberHandle cuptiSubscriber; | |||||
void cuptiSubscriberCallback( | |||||
void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id, | |||||
const void* cb_info) { | |||||
using namespace profiler; | |||||
switch (domain) { | |||||
case CUPTI_CB_DOMAIN_DRIVER_API: { | |||||
auto cb_data = (const CUpti_CallbackData*)cb_info; | |||||
switch (cb_id) { | |||||
case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: { | |||||
if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||||
MGB_RECORD_EVENT( | |||||
CUPTIKernelLaunchEvent, cb_data->correlationId, | |||||
cb_data->symbolName); | |||||
} else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||||
MGB_RECORD_EVENT( | |||||
CUPTIKernelLaunchFinishEvent, cb_data->correlationId, | |||||
cb_data->symbolName); | |||||
} | |||||
break; | |||||
} | |||||
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA: { | |||||
} | |||||
case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: { | |||||
if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||||
MGB_RECORD_EVENT( | |||||
CUPTIMemcpyLaunchEvent, cb_data->correlationId); | |||||
} else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||||
MGB_RECORD_EVENT( | |||||
CUPTIMemcpyLaunchFinishEvent, cb_data->correlationId); | |||||
} | |||||
break; | |||||
} | |||||
default: { | |||||
if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||||
MGB_RECORD_EVENT( | |||||
CUPTIDriverEvent, cb_data->correlationId, | |||||
cb_data->functionName); | |||||
} else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||||
MGB_RECORD_EVENT( | |||||
CUPTIDriverFinishEvent, cb_data->correlationId, | |||||
cb_data->functionName); | |||||
} | |||||
} | |||||
} | |||||
break; | |||||
} | |||||
case CUPTI_CB_DOMAIN_RUNTIME_API: { | |||||
auto cb_data = (const CUpti_CallbackData*)cb_info; | |||||
if (cb_data->callbackSite == CUPTI_API_ENTER) { | |||||
MGB_RECORD_EVENT( | |||||
CUPTIRuntimeEvent, cb_data->correlationId, | |||||
cb_data->functionName); | |||||
} else if (cb_data->callbackSite == CUPTI_API_EXIT) { | |||||
MGB_RECORD_EVENT( | |||||
CUPTIRuntimeFinishEvent, cb_data->correlationId, | |||||
cb_data->functionName); | |||||
} | |||||
break; | |||||
} | |||||
} | |||||
} | |||||
// Converts one CUPTI activity record into a profiler event.
//
// Kernel (serial or concurrent), memcpy and memset records are forwarded with
// their correlation id, stream and start/end timestamps; every other record
// kind is dropped. Kernel and memset end times are shortened by 16ns while
// memcpy end times are passed through unchanged.
// NOTE(review): the reason for the 16ns trim is not stated in this file —
// presumably to keep back-to-back slices from touching; confirm.
void handleActivity(CUpti_Activity* record) {
    using namespace std::chrono_literals;
    const auto end_trim = 16ns;
    const auto kind = record->kind;

    if (kind == CUPTI_ACTIVITY_KIND_KERNEL ||
        kind == CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) {
        auto kernel_rec = cupti::activity<CUpti_ActivityKernel4>(record);
        MGB_RECORD_EVENT(
                profiler::CUPTIKernelExecuteEvent, kernel_rec->correlationId,
                kernel_rec->name, kernel_rec.stream(), kernel_rec.start(),
                kernel_rec.end() - end_trim);
    } else if (kind == CUPTI_ACTIVITY_KIND_MEMCPY) {
        auto memcpy_rec = cupti::activity<CUpti_ActivityMemcpy>(record);
        MGB_RECORD_EVENT(
                profiler::CUPTIMemcpyEvent, memcpy_rec->correlationId,
                memcpy_rec->srcKind, memcpy_rec->dstKind, memcpy_rec->bytes,
                memcpy_rec.stream(), memcpy_rec.start(), memcpy_rec.end());
    } else if (kind == CUPTI_ACTIVITY_KIND_MEMSET) {
        auto memset_rec = cupti::activity<CUpti_ActivityMemset>(record);
        MGB_RECORD_EVENT(
                profiler::CUPTIMemsetEvent, memset_rec->correlationId,
                memset_rec->value, memset_rec->bytes, memset_rec.stream(),
                memset_rec.start(), memset_rec.end() - end_trim);
    }
}
using activity_buffer_t = | |||||
std::aligned_storage_t<8 * 1024 * 1024, ACTIVITY_RECORD_ALIGNMENT>; | |||||
void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { | |||||
*buffer = reinterpret_cast<uint8_t*>(new activity_buffer_t()); | |||||
*size = sizeof(activity_buffer_t); | |||||
*maxNumRecords = 0; | |||||
} | |||||
// CUPTI "buffer completed" callback: forwards every record in `buffer` to
// handleActivity, verifies that CUPTI dropped no records for this
// context/stream, and releases the buffer allocated by bufferRequested.
//
// The buffer is owned by a local scope guard so it is freed on every exit
// path, including when CUPTI_CALL / mgb_assert raise while records are being
// processed (previously the trailing delete was skipped and the 8 MiB buffer
// leaked). `size` is unused; only the first `validSize` bytes hold records.
void bufferCompleted(
        CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size,
        size_t validSize) {
    struct BufferReleaser {
        activity_buffer_t* storage;
        ~BufferReleaser() { delete storage; }
    } releaser{reinterpret_cast<activity_buffer_t*>(buffer)};

    if (validSize == 0) {
        return;
    }

    CUpti_Activity* record = nullptr;
    for (;;) {
        CUptiResult status = cuptiActivityGetNextRecord(buffer, validSize, &record);
        if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
            // No more records in this buffer.
            break;
        }
        // Any other non-success status is reported as an error.
        CUPTI_CALL(status);
        handleActivity(record);
    }

    size_t dropped = 0;
    CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
    mgb_assert(dropped == 0, "%zu records dropped", dropped);
}
static bool initialized = false; | |||||
} // namespace | |||||
bool available() { | |||||
uint32_t compiletime_version = (CUPTI_API_VERSION); | |||||
uint32_t runtime_version; | |||||
CUPTI_CALL(cuptiGetVersion(&runtime_version)); | |||||
if (compiletime_version != runtime_version) { | |||||
static std::once_flag once; | |||||
std::call_once(once, [&] { | |||||
mgb_log_warn( | |||||
"CuPTI version %d mismatch against compiletime version %d. " | |||||
"This may caused by user config LD_LIBRARY_PATH" | |||||
"at unix-like env or config PATH at Windows env", | |||||
(int)compiletime_version, (int)runtime_version); | |||||
}); | |||||
return false; | |||||
} | |||||
return true; | |||||
} | |||||
void enable() { | |||||
// not thread safe | |||||
mgb_assert(!initialized, "cupti already initialized"); | |||||
// callback | |||||
CUPTI_CALL(cuptiSubscribe( | |||||
&cuptiSubscriber, (CUpti_CallbackFunc)cuptiSubscriberCallback, | |||||
(void*)nullptr)); | |||||
CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_DRIVER_API)); | |||||
CUPTI_CALL(cuptiEnableDomain(1, cuptiSubscriber, CUPTI_CB_DOMAIN_RUNTIME_API)); | |||||
// activity | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)); | |||||
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); | |||||
CUPTI_CALL(cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted)); | |||||
initialized = true; | |||||
} | |||||
void disable() { | |||||
mgb_assert(initialized, "cupti not initialized yet"); | |||||
flush(); | |||||
CUPTI_CALL(cuptiFinalize()); | |||||
initialized = false; | |||||
} | |||||
void flush() { | |||||
if (initialized) { | |||||
CUPTI_CALL(cuptiActivityFlushAll(1)); | |||||
} | |||||
} | |||||
bool enabled() { | |||||
return initialized; | |||||
} | |||||
// Returns the current CUPTI timestamp as a time_point of this clock.
//
// The address-of expression had been corrupted into the character sequence
// "×tamp" (an HTML-entity decoding artefact of "&timestamp"), which does not
// compile; it is restored here.
time_point clock::now() {
    uint64_t timestamp = 0;
    CUPTI_CALL(cuptiGetTimestamp(&timestamp));
    using namespace std::chrono;
    // The unsigned CUPTI value is reinterpreted as a signed tick count; values
    // above INT64_MAX would wrap (the original author flagged this as a
    // possible overflow).
    return time_point(duration((int64_t)timestamp));
}
#else | |||||
class CuPTIUnavailableError : public MegBrainError { | |||||
public: | |||||
CuPTIUnavailableError() | |||||
: MegBrainError( | |||||
#if MGB_CUDA | |||||
"CuPTI disabled at compile time" | |||||
#else | |||||
"CuPTI unsupported on non cuda platform" | |||||
#endif | |||||
) { | |||||
} | |||||
}; | |||||
bool available() { | |||||
return false; | |||||
} | |||||
void enable() { | |||||
throw CuPTIUnavailableError(); | |||||
} | |||||
void disable() { | |||||
throw CuPTIUnavailableError(); | |||||
} | |||||
void flush() {} | |||||
bool enabled() { | |||||
return false; | |||||
} | |||||
time_point clock::now() { | |||||
throw CuPTIUnavailableError(); | |||||
} | |||||
#endif | |||||
} // namespace mgb::imperative::cupti |
@@ -12,7 +12,9 @@ | |||||
#include "megbrain/imperative/profiler.h" | #include "megbrain/imperative/profiler.h" | ||||
#include <chrono> | #include <chrono> | ||||
#include <unordered_map> | |||||
#include "megbrain/imperative/cpp_cupti.h" | |||||
#include "megbrain/imperative/ops/opr_attr.h" | #include "megbrain/imperative/ops/opr_attr.h" | ||||
#include "megbrain/imperative/physical_tensor.h" | #include "megbrain/imperative/physical_tensor.h" | ||||
@@ -48,6 +50,21 @@ bool Profiler::sm_profiling = false; | |||||
thread_local Profiler* Profiler::tm_profiler = nullptr; | thread_local Profiler* Profiler::tm_profiler = nullptr; | ||||
std::atomic_size_t Profiler::sm_preferred_capacity; | std::atomic_size_t Profiler::sm_preferred_capacity; | ||||
void Profiler::start_profile() { | |||||
mgb_assert(!sm_profiling); | |||||
sm_start_at = Timer::record_host(); | |||||
sm_profiling = true; | |||||
if (cupti::enabled()) { | |||||
MGB_RECORD_EVENT(profiler::CUPTITimestampEvent, cupti::clock::now()); | |||||
} | |||||
} | |||||
void Profiler::stop_profile() { | |||||
mgb_assert(sm_profiling); | |||||
cupti::flush(); | |||||
sm_profiling = false; | |||||
} | |||||
auto Profiler::get_thread_dict() -> thread_dict_t { | auto Profiler::get_thread_dict() -> thread_dict_t { | ||||
thread_dict_t thread_dict; | thread_dict_t thread_dict; | ||||
for (auto&& [tid, profiler] : sm_profilers) { | for (auto&& [tid, profiler] : sm_profilers) { | ||||
@@ -19,6 +19,7 @@ | |||||
#include "nlohmann/json.hpp" | #include "nlohmann/json.hpp" | ||||
#include "megbrain/imperative/utils/platform.h" | |||||
#include "megbrain/utils/debug.h" | #include "megbrain/utils/debug.h" | ||||
#include "./formats.h" | #include "./formats.h" | ||||
@@ -198,6 +199,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
decltype(getpid()) pid = getpid(); | decltype(getpid()) pid = getpid(); | ||||
std::string pid_str = std::to_string(pid); | std::string pid_str = std::to_string(pid); | ||||
ChromeTimelineEventVisitor() {} | |||||
ChromeTraceEvent& new_event( | ChromeTraceEvent& new_event( | ||||
std::string name, char ph, size_t tid, profiler::HostTime time) { | std::string name, char ph, size_t tid, profiler::HostTime time) { | ||||
return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts( | return trace_events.new_event().name(name).ph(ph).pid(pid).tid(tid).ts( | ||||
@@ -213,8 +216,13 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
.ts(since_start(current->time)); | .ts(since_start(current->time)); | ||||
} | } | ||||
// Creates a trace event for activity reported by CuPTI: the CUPTI stream is
// mapped to its timeline thread id and the CUPTI timestamp is converted to
// host time before delegating to new_event.
ChromeTraceEvent& new_cupti_event(
        std::string name, char ph, cupti::stream_t stream,
        cupti::time_point timestamp) {
    const size_t tid = to_tid(stream);
    const profiler::HostTime host_time = time_from_cupti(timestamp);
    return new_event(name, ph, tid, host_time);
}
ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { | ChromeTraceEvent& new_device_event(std::string name, char ph, CompNode device) { | ||||
using namespace std::literals::chrono_literals; | |||||
auto time = since_start(to_device_time(current->time, device)); | auto time = since_start(to_device_time(current->time, device)); | ||||
return trace_events.new_event() | return trace_events.new_event() | ||||
.name(name) | .name(name) | ||||
@@ -391,6 +399,80 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>( | auto device_ahead = std::chrono::duration_cast<std::chrono::milliseconds>( | ||||
current_device_time - current_host_time); | current_device_time - current_host_time); | ||||
new_host_event("device_ahead_ms", 'C').arg("value", device_ahead.count()); | new_host_event("device_ahead_ms", 'C').arg("value", device_ahead.count()); | ||||
} else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchEvent>) { | |||||
new_host_event(demangle(event.name), 'B'); | |||||
new_host_event(pid_str, 's') | |||||
.id(event.correlation_id) | |||||
.cat("KernelLink") | |||||
.scope(pid_str); | |||||
} else if constexpr (std::is_same_v<TEvent, CUPTIKernelLaunchFinishEvent>) { | |||||
new_host_event(demangle(event.name), 'E'); | |||||
} else if constexpr (std::is_same_v<TEvent, CUPTIKernelExecuteEvent>) { | |||||
new_cupti_event(demangle(event.name), 'B', event.stream, event.start) | |||||
.arg("execution_time", (event.end - event.start).count()); | |||||
new_cupti_event(pid_str, 'f', event.stream, event.end) | |||||
.id(event.correlation_id) | |||||
.bp('e') | |||||
.cat("KernelLink") | |||||
.scope(pid_str); | |||||
new_cupti_event(demangle(event.name), 'E', event.stream, event.end) | |||||
.arg("execution_time", (event.end - event.start).count()); | |||||
} else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchEvent>) { | |||||
new_host_event("Memcpy", 'B'); | |||||
new_host_event(pid_str, 's') | |||||
.id(event.correlation_id) | |||||
.cat("CUPTILink") | |||||
.scope(pid_str); | |||||
} else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyLaunchFinishEvent>) { | |||||
new_host_event("Memcpy", 'E'); | |||||
} else if constexpr (std::is_same_v<TEvent, CUPTIMemcpyEvent>) { | |||||
// Maps a CUPTI memory-kind code to its enumerator name; codes outside the
// table yield "invalid". The bounds check uses >= because valid indices are
// 0 .. count-1 (the previous `>` let kind == count read past the array).
auto memkind2str = [](uint8_t kind) {
    const char* const valid_kinds[] = {
            "CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN",
            "CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE",
            "CUPTI_ACTIVITY_MEMORY_KIND_PINNED",
            "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE",
            "CUPTI_ACTIVITY_MEMORY_KIND_ARRAY",
            "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED",
            "CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC",
            "CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC"};
    const size_t kind_count = sizeof(valid_kinds) / sizeof(valid_kinds[0]);
    if (kind >= kind_count) {
        return "invalid";
    }
    return valid_kinds[kind];
};
new_cupti_event("Memcpy", 'B', event.stream, event.start) | |||||
.arg("bytes", imperative::to_string(event.bytes)) | |||||
.arg("src_kind", memkind2str(event.src_kind)) | |||||
.arg("dst_kind", memkind2str(event.dst_kind)); | |||||
new_cupti_event(pid_str, 'f', event.stream, event.start) | |||||
.id(event.correlation_id) | |||||
.bp('e') | |||||
.cat("CUPTILink") | |||||
.scope(pid_str); | |||||
new_cupti_event("Memcpy", 'E', event.stream, event.end) | |||||
.arg("bytes", imperative::to_string(event.bytes)) | |||||
.arg("src_kind", memkind2str(event.src_kind)) | |||||
.arg("dst_kind", memkind2str(event.dst_kind)); | |||||
} else if constexpr (std::is_same_v<TEvent, CUPTIMemsetEvent>) { | |||||
// Emit a begin/end pair for the memset on its CUPTI stream. The end
// event is stamped with event.end (it previously reused event.start,
// which collapsed every memset to a zero-length slice).
new_cupti_event("Memset", 'B', event.stream, event.start)
        .arg("value", imperative::to_string(event.value))
        .arg("bytes", imperative::to_string(event.bytes));
new_cupti_event("Memset", 'E', event.stream, event.end)
        .arg("value", imperative::to_string(event.value))
        .arg("bytes", imperative::to_string(event.bytes));
} else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeEvent>) { | |||||
new_host_event(event.name, 'B'); | |||||
} else if constexpr (std::is_same_v<TEvent, CUPTIRuntimeFinishEvent>) { | |||||
new_host_event(event.name, 'E'); | |||||
} else if constexpr (std::is_same_v<TEvent, CUPTIDriverEvent>) { | |||||
new_host_event(event.name, 'B'); | |||||
new_host_event(pid_str, 's') | |||||
.id(event.correlation_id) | |||||
.cat("CUPTILink") | |||||
.scope(pid_str); | |||||
} else if constexpr (std::is_same_v<TEvent, CUPTIDriverFinishEvent>) { | |||||
new_host_event(event.name, 'E'); | |||||
} | } | ||||
} | } | ||||
@@ -403,7 +485,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
if (thread_dict.count(host)) { | if (thread_dict.count(host)) { | ||||
trace_events.new_event() | trace_events.new_event() | ||||
.name("thread_name") | .name("thread_name") | ||||
.pid('M') | |||||
.ph('M') | |||||
.pid(pid) | |||||
.tid(to_tid(host)) | .tid(to_tid(host)) | ||||
.arg("name", thread_dict.at(host)); | .arg("name", thread_dict.at(host)); | ||||
} | } | ||||
@@ -411,7 +494,8 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
for (auto&& device : devices()) { | for (auto&& device : devices()) { | ||||
trace_events.new_event() | trace_events.new_event() | ||||
.name("thread_name") | .name("thread_name") | ||||
.pid('M') | |||||
.ph('M') | |||||
.pid(pid) | |||||
.tid(to_tid(device)) | .tid(to_tid(device)) | ||||
.arg("name", device.to_string_logical()); | .arg("name", device.to_string_logical()); | ||||
} | } | ||||
@@ -419,7 +503,7 @@ struct ChromeTimelineEventVisitor : EventVisitor<ChromeTimelineEventVisitor> { | |||||
}; | }; | ||||
void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) { | void dump_chrome_timeline(std::string filename, Profiler::bundle_t result) { | ||||
ChromeTimelineEventVisitor visitor; | |||||
ChromeTimelineEventVisitor visitor{}; | |||||
visitor.process_events(result); | visitor.process_events(result); | ||||
visitor.name_threads(result.thread_dict); | visitor.name_threads(result.thread_dict); | ||||
auto trace_events = std::move(visitor.trace_events); | auto trace_events = std::move(visitor.trace_events); | ||||
@@ -16,6 +16,7 @@ | |||||
#include "../interpreter/stack_manager.h" | #include "../interpreter/stack_manager.h" | ||||
#include "../op_trait.h" | #include "../op_trait.h" | ||||
#include "megbrain/imperative/cpp_cupti.h" | |||||
namespace mgb::imperative::profiler { | namespace mgb::imperative::profiler { | ||||
@@ -181,6 +182,60 @@ DEF_DUR_EVENT(HostToDevice, { | |||||
void* device_ptr; | void* device_ptr; | ||||
}); | }); | ||||
// cupti events
//
// Event payloads for activity captured through CuPTI. `correlation_id` is the
// id CUPTI attaches to an API call and to the device activity it produced.

// A CUPTI clock reading.
DEF_EVENT(CUPTITimestamp, { cupti::clock::time_point timestamp; });

// Host-side kernel launch: correlation id plus a kernel name string.
DEF_DUR_EVENT(CUPTIKernelLaunch, {
    uint32_t correlation_id;
    const char* name;
});

// Kernel execution on a CUPTI stream, bounded by CUPTI timestamps.
DEF_EVENT(CUPTIKernelExecute, {
    uint32_t correlation_id;
    const char* name;
    cupti::stream_t stream;
    cupti::time_point start;
    cupti::time_point end;
});

// Host-side memcpy launch.
DEF_DUR_EVENT(CUPTIMemcpyLaunch, { uint32_t correlation_id; });

// Memcpy execution: source/destination memory-kind codes, byte count, stream
// and CUPTI start/end timestamps.
DEF_EVENT(CUPTIMemcpy, {
    uint32_t correlation_id;
    uint8_t src_kind;
    uint8_t dst_kind;
    uint64_t bytes;
    cupti::stream_t stream;
    cupti::time_point start;
    cupti::time_point end;
});

// Memset execution: fill value, byte count, stream and CUPTI start/end
// timestamps.
DEF_EVENT(CUPTIMemset, {
    uint32_t correlation_id;
    uint32_t value;
    uint64_t bytes;
    cupti::stream_t stream;
    cupti::time_point start;
    cupti::time_point end;
});

// NOTE(review): no payload and no producer visible here — confirm intended use.
DEF_EVENT(CUPTIUnknownDevice, {});

// Runtime API call: correlation id plus a function name string.
DEF_DUR_EVENT(CUPTIRuntime, {
    uint32_t correlation_id;
    const char* name;
});

// Driver API call: correlation id plus a function name string.
DEF_DUR_EVENT(CUPTIDriver, {
    uint32_t correlation_id;
    const char* name;
});

// Pairs a CUPTI stream with a CompNode.
// NOTE(review): presumably used to attribute CUPTI streams to devices — confirm.
DEF_EVENT(CUPTIIdentifyStream, {
    cupti::stream_t stream;
    CompNode device;
});
#undef DEF_EVENT | #undef DEF_EVENT | ||||
#undef DEF_DUR_EVENT | #undef DEF_DUR_EVENT | ||||
@@ -180,10 +180,13 @@ private: | |||||
HostTime m_start_time; | HostTime m_start_time; | ||||
CompNode::UnorderedMap<size_t> m_device_tid_table; | CompNode::UnorderedMap<size_t> m_device_tid_table; | ||||
std::unordered_map<std::thread::id, size_t> m_host_tid_table; | std::unordered_map<std::thread::id, size_t> m_host_tid_table; | ||||
std::unordered_map<cupti::stream_t, size_t> m_cupti_tid_table; | |||||
CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> | CompNode::UnorderedMap<std::map<profiler::HostTime, profiler::RealDuration>> | ||||
m_device_timeline; | m_device_timeline; | ||||
std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack; | std::unordered_map<std::thread::id, std::vector<Trace>> m_trace_stack; | ||||
std::unordered_map<std::string, int64_t> m_counter_table; | std::unordered_map<std::string, int64_t> m_counter_table; | ||||
std::optional<std::pair<profiler::HostTime, cupti::time_point>> m_cupti_timestamp = | |||||
{}; | |||||
protected: | protected: | ||||
Profiler::Record* current; | Profiler::Record* current; | ||||
@@ -191,6 +194,11 @@ protected: | |||||
ProfileTensorState* current_tensor; | ProfileTensorState* current_tensor; | ||||
protected: | protected: | ||||
// Returns the total number of entries across the host, device and CUPTI
// thread-id tables.
// NOTE(review): presumably used as the next unused timeline tid — confirm at
// the call sites.
size_t next_tid() {
    size_t assigned = m_host_tid_table.size();
    assigned += m_device_tid_table.size();
    assigned += m_cupti_tid_table.size();
    return assigned;
}
profiler::Duration since_start(profiler::HostTime time) { | profiler::Duration since_start(profiler::HostTime time) { | ||||
return time - m_start_time; | return time - m_start_time; | ||||
} | } | ||||
@@ -229,6 +237,10 @@ protected: | |||||
size_t to_tid(CompNode device) { return m_device_tid_table.at(device); } | size_t to_tid(CompNode device) { return m_device_tid_table.at(device); } | ||||
// Timeline tid previously registered for `cupti_stream`.
// The lookup goes through unordered_map::at, so asking for a stream that was
// never registered throws std::out_of_range instead of inserting an entry.
size_t to_tid(cupti::stream_t cupti_stream) {
    const size_t tid = m_cupti_tid_table.at(cupti_stream);
    return tid;
}
SmallVector<std::thread::id> host_threads() { | SmallVector<std::thread::id> host_threads() { | ||||
SmallVector<std::thread::id> host_threads; | SmallVector<std::thread::id> host_threads; | ||||
for (auto&& [host, _] : m_host_tid_table) { | for (auto&& [host, _] : m_host_tid_table) { | ||||
@@ -254,6 +266,13 @@ protected: | |||||
value += delta; | value += delta; | ||||
} | } | ||||
// Translates a CUPTI timestamp into host time.
// m_cupti_timestamp stores one (host time, CUPTI time) pair; the result is
// the stored host time shifted by how far `timestamp` lies from the stored
// CUPTI time. Asserts that the pair has been recorded.
profiler::HostTime time_from_cupti(cupti::time_point timestamp) {
    mgb_assert(m_cupti_timestamp.has_value());
    const auto& [host_anchor, cupti_anchor] = *m_cupti_timestamp;
    const auto offset = timestamp - cupti_anchor;
    return host_anchor +
           std::chrono::duration_cast<profiler::HostTime::duration>(offset);
}
public: | public: | ||||
void process_events(Profiler::bundle_t& bundle) { | void process_events(Profiler::bundle_t& bundle) { | ||||
m_start_time = bundle.start_at; | m_start_time = bundle.start_at; | ||||
@@ -272,7 +291,11 @@ public: | |||||
TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, | TensorCommandEvent, TensorCommandFinishEvent, AutoEvictEvent, | ||||
AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, | AutoEvictFinishEvent, CustomEvent, CustomFinishEvent, RecordDeviceEvent, | ||||
ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, | ScopeEvent, ScopeFinishEvent, HostToDeviceEvent, | ||||
HostToDeviceFinishEvent> | |||||
HostToDeviceFinishEvent, CUPTITimestampEvent, CUPTIKernelLaunchEvent, | |||||
CUPTIKernelLaunchFinishEvent, CUPTIKernelExecuteEvent, | |||||
CUPTIMemcpyLaunchEvent, CUPTIMemcpyLaunchFinishEvent, CUPTIMemcpyEvent, | |||||
CUPTIRuntimeEvent, CUPTIRuntimeFinishEvent, CUPTIDriverEvent, | |||||
CUPTIDriverFinishEvent, CUPTIMemsetEvent> | |||||
converter; | converter; | ||||
auto for_each_entry = [&](auto&& handler) { | auto for_each_entry = [&](auto&& handler) { | ||||
@@ -289,7 +312,9 @@ public: | |||||
std::shared_ptr<CompNode::Event> device; | std::shared_ptr<CompNode::Event> device; | ||||
}; | }; | ||||
CompNode::UnorderedMap<DeviceStartPair> device_start_table; | CompNode::UnorderedMap<DeviceStartPair> device_start_table; | ||||
std::unordered_map<cupti::stream_t, CompNode> cupti_stream_table; | |||||
// record device time | |||||
for_each_entry([&](auto&& event) { | for_each_entry([&](auto&& event) { | ||||
using T = std::decay_t<decltype(event)>; | using T = std::decay_t<decltype(event)>; | ||||
if constexpr (std::is_same_v<T, RecordDeviceEvent>) { | if constexpr (std::is_same_v<T, RecordDeviceEvent>) { | ||||
@@ -313,8 +338,7 @@ public: | |||||
// register host threads | // register host threads | ||||
for_each_entry([&](auto&& event) { | for_each_entry([&](auto&& event) { | ||||
if (!m_host_tid_table.count(current->tid)) { | if (!m_host_tid_table.count(current->tid)) { | ||||
m_host_tid_table[current->tid] = { | |||||
m_device_tid_table.size() + m_host_tid_table.size()}; | |||||
m_host_tid_table[current->tid] = next_tid(); | |||||
} | } | ||||
}); | }); | ||||
@@ -340,14 +364,39 @@ public: | |||||
} else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | } else if constexpr (std::is_same_v<T, TensorProduceEvent>) { | ||||
auto& tensor = m_tensors[event.tensor_id]; | auto& tensor = m_tensors[event.tensor_id]; | ||||
if (!m_device_tid_table.count(event.device)) { | if (!m_device_tid_table.count(event.device)) { | ||||
m_device_tid_table[event.device] = { | |||||
m_device_tid_table.size() + m_host_tid_table.size()}; | |||||
m_device_tid_table[event.device] = next_tid(); | |||||
} | } | ||||
tensor.device = event.device; | tensor.device = event.device; | ||||
tensor.layout = event.layout; | tensor.layout = event.layout; | ||||
} | } | ||||
}); | }); | ||||
for_each_entry([&](auto&& event) { | |||||
using T = std::decay_t<decltype(event)>; | |||||
if constexpr (std::is_same_v<T, CUPTIIdentifyStreamEvent>) { | |||||
if (!m_cupti_tid_table.count(event.stream)) { | |||||
m_cupti_tid_table[event.stream] = | |||||
m_device_tid_table.at(event.device); | |||||
} | |||||
} | |||||
}); | |||||
// record cupti streams | |||||
for_each_entry([&](auto&& event) { | |||||
using T = std::decay_t<decltype(event)>; | |||||
if constexpr ( | |||||
std::is_same_v<T, CUPTIKernelExecuteEvent> || | |||||
std::is_same_v<T, CUPTIMemcpyEvent> || | |||||
std::is_same_v<T, CUPTIMemsetEvent>) { | |||||
if (!m_cupti_tid_table.count(event.stream)) { | |||||
m_cupti_tid_table[event.stream] = next_tid(); | |||||
} | |||||
} else if constexpr (std::is_same_v<T, CUPTITimestampEvent>) { | |||||
mgb_assert(!m_cupti_timestamp.has_value()); | |||||
m_cupti_timestamp.emplace(current->time, event.timestamp); | |||||
} | |||||
}); | |||||
// replay execution | // replay execution | ||||
using namespace std::placeholders; | using namespace std::placeholders; | ||||
for_each_entry([&](auto&& event) { | for_each_entry([&](auto&& event) { | ||||
@@ -0,0 +1,25 @@ | |||||
#include "megbrain/imperative/utils/platform.h" | |||||
#ifdef __GNUG__ | |||||
#include <cxxabi.h> | |||||
#include <cstdlib> | |||||
#include <memory> | |||||
#endif | |||||
using namespace mgb; | |||||
using namespace imperative; | |||||
/* | |||||
* demangle typeid, see | |||||
* http://stackoverflow.com/questions/281818/unmangling-the-result-of-stdtype-infoname | |||||
*/ | |||||
std::string mgb::imperative::demangle(std::string mangled) {
#ifdef __GNUG__
    int status = -1;
    char* raw = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
    // the buffer handed back by __cxa_demangle is released with std::free
    std::unique_ptr<char, void (*)(void*)> res{raw, std::free};
    if (status != 0) {
        // demangling did not succeed: hand the input back untouched
        return mangled;
    }
    return res.get();
#else
    // no demangler wired up for this compiler: identity
    return mangled;
#endif
}
@@ -0,0 +1,86 @@ | |||||
#pragma once | |||||
#include <chrono> | |||||
#include <ctime> | |||||
#include "megbrain/common.h" | |||||
#include "megbrain/imperative/utils/to_string.h" | |||||
namespace mgb::imperative::cupti {

// Clock tag for CUPTI timestamps; ticks are nanoseconds.
// now() is only declared here, its definition lives elsewhere.
struct clock {
    using duration = std::chrono::nanoseconds;
    using rep = duration::rep;
    using period = duration::period;
    using time_point = std::chrono::time_point<clock>;
    static constexpr bool is_steady = false;
    static time_point now() /* noexcept */;
};

using time_point = clock::time_point;
using duration = clock::duration;

// Identifies a device by its numeric id.
struct device_t {
    uint32_t device_id;
    bool operator==(const device_t& rhs) const { return rhs.device_id == device_id; }
};

// A context on a device: device id plus context id.
struct context_t : device_t {
    uint32_t context_id;
    bool operator==(const context_t& rhs) const {
        if (!device_t::operator==(rhs)) {
            return false;
        }
        return rhs.context_id == context_id;
    }
};

// A stream inside a context: device id, context id and stream id.
struct stream_t : context_t {
    uint32_t stream_id;
    bool operator==(const stream_t& rhs) const {
        if (!context_t::operator==(rhs)) {
            return false;
        }
        return rhs.stream_id == stream_id;
    }
};

// Module-level entry points; declared only, implemented elsewhere.
bool available();
void enable();
void disable();
void flush();
bool enabled();

// Non-owning typed view over an activity record handed around as void*.
// TActivity has to expose start, end, deviceId, contextId and streamId.
template <typename TActivity>
struct activity {
private:
    TActivity* m_ptr;

public:
    activity(void* ptr) : m_ptr(static_cast<TActivity*>(ptr)) {}

    // record start/end, taken as a tick count on cupti::clock
    time_point start() const {
        duration ticks(m_ptr->start);
        return time_point(ticks);
    }
    time_point end() const {
        duration ticks(m_ptr->end);
        return time_point(ticks);
    }

    // identity of the record, assembled level by level
    device_t device() const {
        device_t dev{m_ptr->deviceId};
        return dev;
    }
    context_t context() const {
        context_t ctx{device(), m_ptr->contextId};
        return ctx;
    }
    stream_t stream() const {
        stream_t strm{context(), m_ptr->streamId};
        return strm;
    }

    // direct access to the underlying record
    TActivity* operator->() const { return m_ptr; }
};

}  // namespace mgb::imperative::cupti
template <> | |||||
class std::hash<mgb::imperative::cupti::stream_t> { | |||||
public: | |||||
size_t operator()(const mgb::imperative::cupti::stream_t& value) const { | |||||
return value.stream_id; | |||||
} | |||||
}; |
@@ -194,16 +194,9 @@ public: | |||||
static bool is_profiling() { return sm_profiling; } | static bool is_profiling() { return sm_profiling; } | ||||
static void start_profile() { | |||||
mgb_assert(!sm_profiling); | |||||
sm_start_at = Timer::record_host(); | |||||
sm_profiling = true; | |||||
} | |||||
static void start_profile(); | |||||
static void stop_profile() { | |||||
mgb_assert(sm_profiling); | |||||
sm_profiling = false; | |||||
} | |||||
static void stop_profile(); | |||||
static thread_dict_t get_thread_dict(); | static thread_dict_t get_thread_dict(); | ||||
@@ -0,0 +1,9 @@ | |||||
#pragma once | |||||
#include <string> | |||||
namespace mgb::imperative { | |||||
// Turns a mangled symbol/type name (e.g. from typeid(...).name()) into a
// readable one. The implementation calls abi::__cxa_demangle when __GNUG__
// is defined; on other compilers, or when demangling fails, the input string
// is returned unchanged.
std::string demangle(std::string mangled); | |||||
} |
@@ -37,6 +37,10 @@ if(MGE_WITH_CUDA) | |||||
list(APPEND LINK_LIBS cudart) | list(APPEND LINK_LIBS cudart) | ||||
endif() | endif() | ||||
# Add CUPTI to the link list when the MGE_WITH_CUPTI option is on.
# NOTE(review): `libcupti` is presumably the target provided by
# cmake/cupti.cmake (included by the top-level CMakeLists under the same
# option) -- confirm the target name.
if(MGE_WITH_CUPTI) | |||||
list(APPEND LINK_LIBS libcupti) | |||||
endif() | |||||
if(MGE_WITH_DISTRIBUTED) | if(MGE_WITH_DISTRIBUTED) | ||||
list(APPEND LINK_LIBS megray) | list(APPEND LINK_LIBS megray) | ||||
endif() | endif() | ||||
@@ -61,11 +61,11 @@ echo "Build with ${SDK_NAME}" | |||||
if [ $SDK_NAME == "cu101" ];then | if [ $SDK_NAME == "cu101" ];then | ||||
CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | ||||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" | |||||
BUILD_GCC8="ON" | |||||
REQUIR_CUDA_VERSION="10010" | |||||
REQUIR_CUDNN_VERSION="7.6.3" | |||||
REQUIR_TENSORRT_VERSION="6.0.1.5" | |||||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF" | |||||
BUILD_GCC8="ON" | |||||
REQUIR_CUDA_VERSION="10010" | |||||
REQUIR_CUDNN_VERSION="7.6.3" | |||||
REQUIR_TENSORRT_VERSION="6.0.1.5" | |||||
REQUIR_CUBLAS_VERSION="10.2.1.243" | REQUIR_CUBLAS_VERSION="10.2.1.243" | ||||
elif [ $SDK_NAME == "cu102_JetsonNano" ];then | elif [ $SDK_NAME == "cu102_JetsonNano" ];then | ||||
@@ -87,6 +87,12 @@ elif [ $SDK_NAME == "cu102_JetsonNano" ];then | |||||
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ||||
${CUDNN_LIB_DIR}/libcudnn.so.8" | ${CUDNN_LIB_DIR}/libcudnn.so.8" | ||||
# On aarch64, put the CUDA 10.2 CUPTI shared library at the front of the
# colon-separated list of libraries to copy.
# ${machine} is quoted so that an empty or unset value evaluates to a plain
# "false" instead of making `[` fail with a syntax error.
if [ "${machine}" == "aarch64" ];then
CUDA_COPY_LIB_LIST="\
${CUDA_LIB_DIR}/libcupti.so.10.2:\
${CUDA_COPY_LIB_LIST}"
fi
EXTRA_CMAKE_FLAG="-DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_53,code=sm_53\" " | EXTRA_CMAKE_FLAG="-DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_53,code=sm_53\" " | ||||
elif [ $SDK_NAME == "cu111" ];then | elif [ $SDK_NAME == "cu111" ];then | ||||
@@ -118,6 +124,12 @@ elif [ $SDK_NAME == "cu111" ];then | |||||
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | ||||
${CUDNN_LIB_DIR}/libcudnn.so.8" | ${CUDNN_LIB_DIR}/libcudnn.so.8" | ||||
# On aarch64, put the CUDA 11.1 CUPTI shared library at the front of the
# colon-separated list of libraries to copy.
# ${machine} is quoted so that an empty or unset value evaluates to a plain
# "false" instead of making `[` fail with a syntax error.
if [ "${machine}" == "aarch64" ];then
CUDA_COPY_LIB_LIST="\
${CUDA_LIB_DIR}/libcupti.so.11.1:\
${CUDA_COPY_LIB_LIST}"
fi
if [ ${IN_CI} = "true" ] && [ ${machine} == "aarch64" ]; then | if [ ${IN_CI} = "true" ] && [ ${machine} == "aarch64" ]; then | ||||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_75,code=sm_75\" " | EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_75,code=sm_75\" " | ||||
else | else | ||||
@@ -152,9 +164,9 @@ elif [ $SDK_NAME == "cu112" ];then | |||||
-gencode arch=compute_86,code=sm_86 \ | -gencode arch=compute_86,code=sm_86 \ | ||||
-gencode arch=compute_86,code=compute_86\" " | -gencode arch=compute_86,code=compute_86\" " | ||||
REQUIR_CUDA_VERSION="11020" | |||||
REQUIR_CUDNN_VERSION="8.0.4" | |||||
REQUIR_TENSORRT_VERSION="7.2.2.3" | |||||
REQUIR_CUDA_VERSION="11020" | |||||
REQUIR_CUDNN_VERSION="8.0.4" | |||||
REQUIR_TENSORRT_VERSION="7.2.2.3" | |||||
REQUIR_CUBLAS_VERSION="11.3.1.68" | REQUIR_CUBLAS_VERSION="11.3.1.68" | ||||
elif [ $SDK_NAME == "cpu" ];then | elif [ $SDK_NAME == "cpu" ];then | ||||
@@ -35,6 +35,7 @@ | |||||
#cmakedefine01 MGB_ENABLE_FBS_SERIALIZATION | #cmakedefine01 MGB_ENABLE_FBS_SERIALIZATION | ||||
#cmakedefine01 MGB_IS_DEV | #cmakedefine01 MGB_IS_DEV | ||||
#cmakedefine01 MGB_CUSTOM_OP | #cmakedefine01 MGB_CUSTOM_OP | ||||
#cmakedefine01 MGB_CUPTI | |||||
// DNN related flags | // DNN related flags | ||||
// Platform macro's | // Platform macro's | ||||
#cmakedefine01 MEGDNN_WITH_CUDA | #cmakedefine01 MEGDNN_WITH_CUDA | ||||