@@ -39,6 +39,9 @@ option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF)
option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON)
option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
@@ -55,6 +58,14 @@ option(MGE_BUILD_SDK "Build load_and_run" ON)
option(MGE_INFERENCE_ONLY "Build inference only library." OFF)
option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support," ON)
option(MGE_WITH_ROCM "Enable ROCM support" OFF)
option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
set(MGE_WITH_ANY_CUDA_STUB ON)
else()
set(MGE_WITH_ANY_CUDA_STUB OFF)
endif()
if(NOT ${MGE_BIN_REDUCE} STREQUAL "")
message(STATUS "build with BIN REDUCE")
@@ -205,14 +216,24 @@ else()
endif()
endif()
if(MGE_WITH_CUDA)
include(cmake/cudnn.cmake)
if(MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED))
message(WARNING "Static link CUDNN8 will auto enable MGE_WITH_LARGE_ARCHIVE=ON")
set(MGE_WITH_LARGE_ARCHIVE ON)
endif()
endif()
CHECK_CXX_COMPILER_FLAG(-fuse-ld=gold CXX_SUPPORT_GOLD)
if(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32)
if(MGE_WITH_LARGE_ARCHIVE)
message(STATUS "Set -mcmodel=large and disable -fuse-ld=gold")
set(MGE_COMMON_LINKER_FLAGS "-mcmodel=large")
elseif(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32 AND NOT MGE_WITH_LARGE_ARCHIVE)
message(STATUS "Using GNU gold linker.")
set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold")
endif()
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
if(NOT MGE_WITH_JIT)
if(MGE_WITH_HALIDE)
@@ -353,11 +374,28 @@ if(MGE_WITH_CUDA)
if(NOT MGE_ENABLE_EXCEPTIONS)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions")
endif()
if(NOT MGE_CUDA_GENCODE)
if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386")
set(MEGDNN_THREADS_512 0)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0")
if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED))
message(WARNING "Static link CUDNN8 with many sm is unworkable, we only enable sm61 sm70 sm75 by default, and enable MGE_WITH_LARGE_ARCHIVE=ON")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.1.0")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=sm_86")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=compute_86")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.0.0")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=compute_80")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
@@ -385,7 +423,6 @@ if(MGE_WITH_CUDA)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}")
include(cmake/cudnn.cmake)
if(MGE_WITH_TRT)
include(cmake/tensorrt.cmake)
endif()
@@ -394,12 +431,30 @@ if(MGE_WITH_CUDA)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY})
message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}")
else()
if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7)
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer myelin_compiler_static myelin_executor_static myelin_pattern_runtime_static myelin_pattern_library_static -Wl,--no-whole-archive)
else()
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer -Wl,--no-whole-archive)
endif()
endif()
endif()
if("${CUDNN_VERSION}" STREQUAL "7.5.0")
if(MSVC OR WIN32)
message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}")
list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY})
else()
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive)
message(STATUS "cudnn 7.5.0 has bug in cudnnConvolutionBiasActivationForward, need --whole-archive to workaround, ref https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_7xx.html")
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive)
endif()
else()
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive)
if(MSVC OR WIN32)
message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}")
list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY})
else()
list(APPEND MGE_CUDA_LIBS libcudnn)
endif()
endif()
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
@@ -447,15 +502,37 @@ if(MGE_WITH_CUDA)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand)
endif()
list(APPEND MGE_CUDA_LIBS cudart)
endif()
if(NOT MGE_WITH_CUDA_STUB)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cuda.lib)
else()
list(APPEND MGE_CUDA_LIBS cuda)
endif()
endif()
if(NOT MGE_WITH_NVRTC_STUB)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS nvrtc.lib)
else()
list(APPEND MGE_CUDA_LIBS nvrtc)
endif()
endif()
if(MGE_WITH_ANY_CUDA_STUB)
add_subdirectory(dnn/cuda-stub)
list(APPEND MGE_CUDA_LIBS cuda-stub)
endif()
add_subdirectory(dnn/cuda-stub)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS nvrtc.lib cuda-stub)
list(APPEND MGE_CUDA_LIBS nvrtc.lib)
else()
list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt)
list(APPEND MGE_CUDA_LIBS nvToolsExt)
endif()
set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}")
set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -lrt")
endif()
if(MGE_WITH_CAMBRICON)
@@ -800,6 +877,9 @@ if(TARGET _imperative_rt)
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}>
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}>
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py
DEPENDS _imperative_rt
VERBATIM
)
@@ -863,3 +943,9 @@ if(MGE_WITH_JIT_MLIR)
add_subdirectory(tools/mlir/mgb-opt)
add_subdirectory(tools/mlir/mgb-file-check)
endif()
if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED))
message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
endif()
@@ -11,7 +11,7 @@ if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "")
set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR})
endif()
if(MGE_CUDA_USE_STATIC)
if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED)
find_library(CUDNN_LIBRARY
NAMES libcudnn_static.a cudnn.lib
PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX}
@@ -42,7 +42,12 @@ if(CUDNN_INCLUDE_DIR STREQUAL "CUDNN_INCLUDE_DIR-NOTFOUND")
message(FATAL_ERROR "Can not find CuDNN Library")
endif()
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
if(EXISTS ${CUDNN_INCLUDE_DIR}/cudnn_version.h)
file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
else()
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
endif()
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
@@ -55,7 +60,9 @@ string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
CUDNN_PATCH_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
CUDNN_PATCH_VERSION "${CUDNN_PATCH_VERSION}")
set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION})
set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCH_VERSION})
if(MGE_CUDA_USE_STATIC)
add_library(libcudnn STATIC IMPORTED)
@@ -1,11 +1,21 @@
file (GLOB_RECURSE SOURCES src/*.cpp)
file (GLOB_RECURSE CUDA_STUB src/libcuda.cpp)
file (GLOB_RECURSE NVRTC_STUB src/libnvrtc.cpp)
if(MGE_WITH_CUDA_STUB)
list(APPEND STUB_SRC ${CUDA_STUB})
endif()
if(MGE_WITH_NVRTC_STUB)
list(APPEND STUB_SRC ${NVRTC_STUB})
endif()
if(MSVC OR WIN32)
add_library (cuda-stub STATIC ${SOURCES})
add_library (cuda-stub STATIC ${STUB_SRC})
else()
add_library (cuda-stub SHARED ${SOURCES})
add_library (cuda-stub SHARED ${STUB_SRC})
endif()
set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda)
set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda_stub)
target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL)
if (MSVC OR WIN32)
target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined)
@@ -0,0 +1,109 @@
#if defined(_WIN32)
#include <windows.h>
#define RTLD_LAZY 0
static void* dlopen(const char* file, int) {
    return static_cast<void*>(LoadLibraryA(file));
}
static void* dlerror() {
    const char* errmsg = "dlerror not available in windows";
    return const_cast<char*>(errmsg);
}
static void* dlsym(void* handle, const char* name) {
    FARPROC symbol = GetProcAddress((HMODULE)handle, name);
    return reinterpret_cast<void*>(symbol);
}
#else
#include <dlfcn.h>
#include <unistd.h>
#endif
#include <sstream>
#include <string>
#include <vector>
static std::vector<std::string> split_string(const std::string& s, char delim) {
    std::vector<std::string> elems;
    std::stringstream ss(s);
    std::string item;
    while (std::getline(ss, item, delim)) {
        elems.push_back(item);
    }
    return elems;
}
static std::vector<std::string> get_env_dir(const char* env_name) {
    const char* env_p = std::getenv(env_name);
    std::vector<std::string> env_dir;
    if (env_p) {
        env_dir = split_string(env_p, ':');
    }
    return env_dir;
}
static void* try_open_handle(std::vector<std::string> dir_vec,
        std::string default_so_name) {
    void* handle = nullptr;
    for (auto& tk_path : dir_vec) {
        handle = dlopen((tk_path + "/" + default_so_name).c_str(), RTLD_LAZY);
        if (handle) {
            break;
        }
    }
    return handle;
}
static void* try_open_handle(const char** so_vec, int nr_so) {
    void* handle = nullptr;
    for (int i = 0; i < nr_so; ++i) {
        handle = dlopen(so_vec[i], RTLD_LAZY);
        if (handle) {
            break;
        }
    }
    return handle;
}
static void* get_library_handle() {
    std::vector<std::string> cuda_tk_dir = get_env_dir("CUDA_TK_PATH");
    std::vector<std::string> ld_dir = get_env_dir("LD_LIBRARY_PATH");
    void* handle = nullptr;
    if (!handle) {
        handle = try_open_handle(ld_dir, default_so_name);
    }
    if (!handle) {
        handle = try_open_handle(cuda_tk_dir, default_so_name);
    }
    if (!handle) {
        handle = try_open_handle(default_so_paths,
                sizeof(default_so_paths) / sizeof(char*));
    }
    if (!handle) {
        handle = try_open_handle(extra_so_paths,
                sizeof(extra_so_paths) / sizeof(char*));
    }
    if (!handle) {
        LOGE("Failed to load %s API library", g_default_api_name);
        return nullptr;
    }
    return handle;
}
static void log_failed_load(int func_idx) {
    LOGE("failed to load %s func: %s", g_default_api_name,
            g_func_name[func_idx]);
}
static void* resolve_library_func(void* handle, const char* func) {
    if (!handle) {
        LOGE("%s handle should not be nullptr!", g_default_api_name);
        return nullptr;
    }
    auto ret = dlsym(handle, func);
    if (!ret) {
        LOGE("failed to load %s func: %s", g_default_api_name, func);
    }
    return ret;
}
@@ -3,36 +3,14 @@
#include <cstdio>
#define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v)
extern "C" {
#include <cuda.h>
#include "cuda.h"
}
#include <cudaProfiler.h>
#include "cudaProfiler.h"
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#if defined(_WIN32)
#include <windows.h>
#define RTLD_LAZY 0
static void* dlopen(const char* file, int) {
return static_cast<void*>(LoadLibraryA(file));
}
static void* dlerror() {
const char* errmsg = "dlerror not aviable in windows";
return const_cast<char*>(errmsg);
}
static void* dlsym(void* handle, const char* name) {
FARPROC symbol = GetProcAddress((HMODULE)handle, name);
return reinterpret_cast<void*>(symbol);
}
#else
#include <dlfcn.h>
#include <unistd.h>
#endif
static void log_failed_load(int func_idx);
namespace {
template <typename T>
@@ -42,68 +20,63 @@ CUresult on_init_failed(int func_idx) {
log_failed_load(func_idx);
return CUDA_ERROR_UNKNOWN;
}
}
} // namespace
#define _WRAPLIB_API_CALL CUDAAPI
#define _WRAPLIB_CALLBACK CUDA_CB
#include "./libcuda-wrap.h"
#if CUDA_VERSION == 10010
#include "./libcuda-wrap_10.1.h"
#elif CUDA_VERSION == 10020
#include "./libcuda-wrap_10.2.h"
#elif CUDA_VERSION == 11010
#include "./libcuda-wrap_11.1.h"
#elif CUDA_VERSION == 11020
#include "./libcuda-wrap_11.2.h"
#else
#error "cuda stub not support this cuda version, you can close cuda stub to passby" | |||||
#endif
#undef _WRAPLIB_CALLBACK
#undef _WRAPLIB_API_CALL
static const char* default_so_name =
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
"nvcuda.dll";
#elif defined(__APPLE__) || defined(__MACOSX)
"libcuda.dylib";
#else
"libcuda.so.1";
#endif
// Harvested from cuda_drvapi_dynlink.c
static const char* default_so_paths[] = {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
"nvcuda.dll",
#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX)
"nvcuda.dll",
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \
defined(__MACOSX)
#if defined(__APPLE__) || defined(__MACOSX)
"/usr/local/cuda/lib/libcuda.dylib",
"/usr/local/cuda/lib/libcuda.dylib",
#elif defined(__ANDROID__)
#if defined (__aarch64__)
"/system/vendor/lib64/libcuda.so",
#if defined(__aarch64__)
"/system/vendor/lib64/libcuda.so",
#elif defined(__arm__)
"/system/vendor/lib/libcuda.so",
"/system/vendor/lib/libcuda.so",
#endif
#else
"libcuda.so.1",
// In case some users does not have correct search path configured in
// /etc/ld.so.conf
"/usr/lib/x86_64-linux-gnu/libcuda.so",
"/usr/local/nvidia/lib64/libcuda.so",
"libcuda.so.1",
#endif
#else
#error "Unknown platform"
#endif
};
static void* get_library_handle() {
void* handle = nullptr;
for (size_t i = 0; i < (sizeof(default_so_paths) / sizeof(char*)); i++) {
handle = dlopen(default_so_paths[i], RTLD_LAZY);
if (handle) {
break;
}
}
if (!handle) {
LOGE("Failed to load CUDA Driver API library");
return nullptr;
}
return handle;
}
static void log_failed_load(int func_idx) {
LOGE("failed to load cuda func: %s", g_func_name[func_idx]);
}
static const char* extra_so_paths[] = {
"/usr/lib/x86_64-linux-gnu/libcuda.so",
"/usr/local/nvidia/lib64/libcuda.so",
};
static void* resolve_library_func(void* handle, const char* func) {
if (!handle) {
LOGE("handle should not be nullptr!");
return nullptr;
}
auto ret = dlsym(handle, func);
if (!ret) {
LOGE("failed to load cuda func: %s", func);
}
return ret;
}
static const char* g_default_api_name = "cuda";
#include "./dlopen_helper.h"
@@ -0,0 +1,367 @@
// generated by wraplib.py
// --- begin functions to be implemented
#ifndef _WRAPLIB_API_CALL
#define _WRAPLIB_API_CALL
#endif
#ifndef _WRAPLIB_CALLBACK
#define _WRAPLIB_CALLBACK
#endif
#ifndef ON_ENTRY
#define ON_ENTRY(x)
#endif
static void* get_library_handle();
static void* resolve_library_func(void*, const char*);
namespace {
template <typename T>
T on_init_failed(int func_idx);
}
// --- end functions to be implemented
#include <cstddef>
#include <mutex>
extern "C" {
const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0);
nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0);
nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0,
        const char* arg1,
        const char* arg2, int arg3,
        const char* const* arg4,
        const char* const* arg5);
nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0);
nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1,
        const char* const* arg2);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0,
        size_t* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0,
        size_t* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0, char* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0,
        const char* const arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0,
        const char* const arg1,
        const char** arg2);
}
static void load_library();
static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_init(
        nvrtcResult arg0) {
    load_library();
    return nvrtcGetErrorString(arg0);
}
static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_error(nvrtcResult) {
    return on_init_failed<const char*>(0);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_init(int* arg0, int* arg1) {
    load_library();
    return nvrtcVersion(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_error(int*, int*) {
    return on_init_failed<nvrtcResult>(1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_init(int* arg0) {
    load_library();
    return nvrtcGetNumSupportedArchs(arg0);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_error(int*) {
    return on_init_failed<nvrtcResult>(2);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_init(int* arg0) {
    load_library();
    return nvrtcGetSupportedArchs(arg0);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_error(int*) {
    return on_init_failed<nvrtcResult>(3);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram_init(
        nvrtcProgram* arg0, const char* arg1, const char* arg2, int arg3,
        const char* const* arg4, const char* const* arg5) {
    load_library();
    return nvrtcCreateProgram(arg0, arg1, arg2, arg3, arg4, arg5);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcCreateProgram_error(nvrtcProgram*, const char*, const char*, int,
        const char* const*, const char* const*) {
    return on_init_failed<nvrtcResult>(4);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcDestroyProgram_init(nvrtcProgram* arg0) {
    load_library();
    return nvrtcDestroyProgram(arg0);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram_error(nvrtcProgram*) {
    return on_init_failed<nvrtcResult>(5);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcCompileProgram_init(nvrtcProgram arg0, int arg1, const char* const* arg2) {
    load_library();
    return nvrtcCompileProgram(arg0, arg1, arg2);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcCompileProgram_error(nvrtcProgram, int, const char* const*) {
    return on_init_failed<nvrtcResult>(6);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_init(nvrtcProgram arg0,
        size_t* arg1) {
    load_library();
    return nvrtcGetPTXSize(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_error(nvrtcProgram,
        size_t*) {
    return on_init_failed<nvrtcResult>(7);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_init(nvrtcProgram arg0,
        char* arg1) {
    load_library();
    return nvrtcGetPTX(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_error(nvrtcProgram, char*) {
    return on_init_failed<nvrtcResult>(8);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_init(nvrtcProgram arg0,
        size_t* arg1) {
    load_library();
    return nvrtcGetCUBINSize(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_error(nvrtcProgram,
        size_t*) {
    return on_init_failed<nvrtcResult>(9);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_init(nvrtcProgram arg0,
        char* arg1) {
    load_library();
    return nvrtcGetCUBIN(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_error(nvrtcProgram, char*) {
    return on_init_failed<nvrtcResult>(10);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcGetProgramLogSize_init(nvrtcProgram arg0, size_t* arg1) {
    load_library();
    return nvrtcGetProgramLogSize(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize_error(nvrtcProgram,
        size_t*) {
    return on_init_failed<nvrtcResult>(11);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_init(nvrtcProgram arg0,
        char* arg1) {
    load_library();
    return nvrtcGetProgramLog(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_error(nvrtcProgram,
        char*) {
    return on_init_failed<nvrtcResult>(12);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcAddNameExpression_init(nvrtcProgram arg0, const char* const arg1) {
    load_library();
    return nvrtcAddNameExpression(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcAddNameExpression_error(nvrtcProgram, const char* const) {
    return on_init_failed<nvrtcResult>(13);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName_init(
        nvrtcProgram arg0, const char* const arg1, const char** arg2) {
    load_library();
    return nvrtcGetLoweredName(arg0, arg1, arg2);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcGetLoweredName_error(nvrtcProgram, const char* const, const char**) {
    return on_init_failed<nvrtcResult>(14);
}
static constexpr size_t NR_FUNC = 15;
static void* g_func_table[NR_FUNC] = {(void*)(&nvrtcGetErrorString_init),
        (void*)(&nvrtcVersion_init),
        (void*)(&nvrtcGetNumSupportedArchs_init),
        (void*)(&nvrtcGetSupportedArchs_init),
        (void*)(&nvrtcCreateProgram_init),
        (void*)(&nvrtcDestroyProgram_init),
        (void*)(&nvrtcCompileProgram_init),
        (void*)(&nvrtcGetPTXSize_init),
        (void*)(&nvrtcGetPTX_init),
        (void*)(&nvrtcGetCUBINSize_init),
        (void*)(&nvrtcGetCUBIN_init),
        (void*)(&nvrtcGetProgramLogSize_init),
        (void*)(&nvrtcGetProgramLog_init),
        (void*)(&nvrtcAddNameExpression_init),
        (void*)(&nvrtcGetLoweredName_init)};
static void* g_func_table_error[NR_FUNC] = {
        (void*)(&nvrtcGetErrorString_error),
        (void*)(&nvrtcVersion_error),
        (void*)(&nvrtcGetNumSupportedArchs_error),
        (void*)(&nvrtcGetSupportedArchs_error),
        (void*)(&nvrtcCreateProgram_error),
        (void*)(&nvrtcDestroyProgram_error),
        (void*)(&nvrtcCompileProgram_error),
        (void*)(&nvrtcGetPTXSize_error),
        (void*)(&nvrtcGetPTX_error),
        (void*)(&nvrtcGetCUBINSize_error),
        (void*)(&nvrtcGetCUBIN_error),
        (void*)(&nvrtcGetProgramLogSize_error),
        (void*)(&nvrtcGetProgramLog_error),
        (void*)(&nvrtcAddNameExpression_error),
        (void*)(&nvrtcGetLoweredName_error)};
static const char* const g_func_name[NR_FUNC] = {"nvrtcGetErrorString",
        "nvrtcVersion",
        "nvrtcGetNumSupportedArchs",
        "nvrtcGetSupportedArchs",
        "nvrtcCreateProgram",
        "nvrtcDestroyProgram",
        "nvrtcCompileProgram",
        "nvrtcGetPTXSize",
        "nvrtcGetPTX",
        "nvrtcGetCUBINSize",
        "nvrtcGetCUBIN",
        "nvrtcGetProgramLogSize",
        "nvrtcGetProgramLog",
        "nvrtcAddNameExpression",
        "nvrtcGetLoweredName"};
static void load_library() {
    static bool done = false;
    static std::mutex mtx;
    std::lock_guard<std::mutex> lg{mtx};
    if (done)
        return;
    void* handle = get_library_handle();
    for (size_t i = 0; i < NR_FUNC; ++i) {
        void* func;
        if (!handle) {
            func = nullptr;
        } else {
            func = resolve_library_func(handle, g_func_name[i]);
        }
        if (!func) {
            func = g_func_table_error[i];
        }
        __atomic_store_n(g_func_table + i, func, __ATOMIC_RELAXED);
    }
    done = true;
}
const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0) {
    typedef const char*(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcResult);
    ON_ENTRY(nvrtcGetErrorString);
    f_ptr_t f = (f_ptr_t)(g_func_table[0]);
    return f(arg0);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*, int*);
    ON_ENTRY(nvrtcVersion);
    f_ptr_t f = (f_ptr_t)(g_func_table[1]);
    return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*);
    ON_ENTRY(nvrtcGetNumSupportedArchs);
    f_ptr_t f = (f_ptr_t)(g_func_table[2]);
    return f(arg0);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*);
    ON_ENTRY(nvrtcGetSupportedArchs);
    f_ptr_t f = (f_ptr_t)(g_func_table[3]);
    return f(arg0);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0,
        const char* arg1,
        const char* arg2, int arg3,
        const char* const* arg4,
        const char* const* arg5) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(
            nvrtcProgram*, const char*, const char*, int, const char* const*,
            const char* const*);
    ON_ENTRY(nvrtcCreateProgram);
    f_ptr_t f = (f_ptr_t)(g_func_table[4]);
    return f(arg0, arg1, arg2, arg3, arg4, arg5);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram*);
    ON_ENTRY(nvrtcDestroyProgram);
    f_ptr_t f = (f_ptr_t)(g_func_table[5]);
    return f(arg0);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1,
        const char* const* arg2) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, int,
            const char* const*);
    ON_ENTRY(nvrtcCompileProgram);
    f_ptr_t f = (f_ptr_t)(g_func_table[6]);
    return f(arg0, arg1, arg2);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*);
    ON_ENTRY(nvrtcGetPTXSize);
    f_ptr_t f = (f_ptr_t)(g_func_table[7]);
    return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*);
    ON_ENTRY(nvrtcGetPTX);
    f_ptr_t f = (f_ptr_t)(g_func_table[8]);
    return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0,
        size_t* arg1) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*);
    ON_ENTRY(nvrtcGetCUBINSize);
    f_ptr_t f = (f_ptr_t)(g_func_table[9]);
    return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*);
    ON_ENTRY(nvrtcGetCUBIN);
    f_ptr_t f = (f_ptr_t)(g_func_table[10]);
    return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0,
        size_t* arg1) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*);
    ON_ENTRY(nvrtcGetProgramLogSize);
    f_ptr_t f = (f_ptr_t)(g_func_table[11]);
    return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0,
        char* arg1) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*);
    ON_ENTRY(nvrtcGetProgramLog);
    f_ptr_t f = (f_ptr_t)(g_func_table[12]);
    return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0,
        const char* const arg1) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram,
            const char* const);
    ON_ENTRY(nvrtcAddNameExpression);
    f_ptr_t f = (f_ptr_t)(g_func_table[13]);
    return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0,
        const char* const arg1,
        const char** arg2) {
    typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(
            nvrtcProgram, const char* const, const char**);
    ON_ENTRY(nvrtcGetLoweredName);
    f_ptr_t f = (f_ptr_t)(g_func_table[14]);
    return f(arg0, arg1, arg2);
}
@@ -0,0 +1,75 @@
/**
 * \file dnn/cuda-stub/src/libnvrtc.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||||
 * implied.
 */
#pragma GCC visibility push(default)
#include <cstdio>
#define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v)
#include "./nvrtc_type.h"
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
static void log_failed_load(int func_idx);
namespace {
template <typename T>
T on_init_failed(int func_idx);
template <>
nvrtcResult on_init_failed(int func_idx) {
    log_failed_load(func_idx);
    return NVRTC_ERROR_INTERNAL_ERROR;
}
template <>
const char* on_init_failed(int func_idx) {
    log_failed_load(func_idx);
    return "load lib failed";
}
} // namespace
#include "./libnvrtc-wrap.h"
static const char* default_so_name =
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        "nvrtc.dll";
#elif defined(__APPLE__) || defined(__MACOSX)
        "libnvrtc.dylib";
#else
        "libnvrtc.so";
#endif
static const char* default_so_paths[] = {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        "nvrtc.dll",
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \
        defined(__MACOSX)
#if defined(__APPLE__) || defined(__MACOSX)
        "/usr/local/cuda/lib/libnvrtc.dylib",
#elif defined(__ANDROID__)
#if defined(__aarch64__)
        "/system/vendor/lib64/libnvrtc.so",
#elif defined(__arm__)
        "/system/vendor/lib/libnvrtc.so",
#endif
#else
        "libnvrtc.so",
        // In case some users does not have correct search path configured in
        // /etc/ld.so.conf
        "/usr/lib/x86_64-linux-gnu/libnvrtc.so",
        "/usr/local/nvidia/lib64/libnvrtc.so",
        "/usr/local/cuda/lib64/libnvrtc.so",
#endif
#else
#error "Unknown platform"
#endif
};
static const char* extra_so_paths[] = {};
static const char* g_default_api_name = "nvrtc";
#include "./dlopen_helper.h"
@@ -0,0 +1,27 @@
#pragma once
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdlib.h>
typedef enum {
    NVRTC_SUCCESS = 0,
    NVRTC_ERROR_OUT_OF_MEMORY = 1,
    NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
    NVRTC_ERROR_INVALID_INPUT = 3,
    NVRTC_ERROR_INVALID_PROGRAM = 4,
    NVRTC_ERROR_INVALID_OPTION = 5,
    NVRTC_ERROR_COMPILATION = 6,
    NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
    NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
    NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
    NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
    NVRTC_ERROR_INTERNAL_ERROR = 11
} nvrtcResult;
typedef struct _nvrtcProgram *nvrtcProgram;
#ifdef __cplusplus
}
#endif /* __cplusplus */
@@ -26,6 +26,7 @@ static inline CUBLASLTMatmulDesc::SizeArgs from_local_size_args(
return {handle, transA, transB,
args.layout_a, args.layout_b, args.layout_c};
}
bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available(
const SizeArgs& args) const {
auto cublasLt_args = from_local_size_args(args);
@@ -35,6 +36,7 @@ bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available(
.is_available(cublasLt_args, INT_MAX);
return res;
}
size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes(
const SizeArgs& args) const {
auto cublasLt_args = from_local_size_args(args);
@@ -43,6 +45,7 @@ size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes(
desc.get_algorithm_heuristic(cublasLt_args, INT_MAX, algo);
return desc.get_workspace_bundle(cublasLt_args, algo).total_size_in_bytes();
}
void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(
const ExecArgs& args) const {
auto cublasLt_args = from_local_size_args(args);
@@ -89,6 +92,7 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(
desc.layout_c, &algo, ws_bundle.get(0),
ws_bundle.get_size(0), stream));
};
auto batched_igemm = [&]() {
auto zero = handle->zero_device();
auto one = handle->one_device();
@@ -133,6 +137,18 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(
};
ws_bundle.set(args.workspace.raw_ptr);
#if CUDA_VERSION >= 11000
if (desc.dt_compute == CUBLAS_COMPUTE_32I) {
batched_igemm();
} else if (desc.dt_compute == CUBLAS_COMPUTE_16F) {
batched_hgemm();
} else if (desc.dt_compute == CUBLAS_COMPUTE_32F) {
batched_sgemm();
} else {
megdnn_throw(
megdnn_mangle("compute_type must be int32/float16/float32"));
}
#else
if (desc.dt_compute == CUDA_R_32I) {
batched_igemm();
} else if (desc.dt_compute == CUDA_R_16F) {
@@ -143,5 +159,6 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(
megdnn_throw(
megdnn_mangle("compute_type must be int32/float16/float32"));
}
#endif
}
#endif
@@ -156,6 +156,9 @@ std::string ConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const {
case param::ConvBias::NonlineMode::IDENTITY:
nonlinear_mode_str = "IDENTITY";
break;
case param::ConvBias::NonlineMode::H_SWISH:
nonlinear_mode_str = "H_SWISH";
break;
default:
megdnn_throw("invalid conv bias nonlinear mode");
}
@@ -165,6 +165,23 @@ void TensorDesc::set(const TensorLayout& layout,
}
}
std::string TensorDesc::to_string() {
cudnnDataType_t data_type;
int n;
int c;
int h;
int w;
int n_stride;
int c_stride;
int h_stride;
int w_stride;
cudnn_check(cudnnGetTensor4dDescriptor(desc, &data_type, &n, &c, &h, &w,
&n_stride, &c_stride, &h_stride,
&w_stride));
return ssprintf("<dtype_%d, %d,%d,%d,%d(%d,%d,%d,%d)>", data_type, n, c, h,
w, n_stride, c_stride, h_stride, w_stride);
}
template <typename Param>
FilterDesc<Param>::FilterDesc() {
cudnn_check(cudnnCreateFilterDescriptor(&desc));
@@ -176,6 +193,20 @@ FilterDesc<Param>::~FilterDesc() {
}
template <typename Param>
std::string FilterDesc<Param>::to_string() {
cudnnDataType_t data_type;
cudnnTensorFormat_t format;
int k;
int c;
int h;
int w;
cudnn_check(cudnnGetFilter4dDescriptor(desc, &data_type, &format, &k, &c,
&h, &w));
return ssprintf("<dtype_%d, format_%d, %d,%d,%d,%d>", data_type, format, k, c, h,
w);
}
template <typename Param>
void FilterDesc<Param>::set(
const typename ConvolutionBase<Param>::CanonizedFilterMeta&
filter_meta) {
@@ -30,6 +30,7 @@ class TensorDesc {
//! default layout is nchw
void set(const TensorLayout& layout, const param::Convolution::Format =
param::Convolution::Format::NCHW);
std::string to_string();
~TensorDesc();
cudnnTensorDescriptor_t desc;
};
@@ -39,6 +40,7 @@ class FilterDesc {
public:
FilterDesc();
void set(const typename ConvolutionBase<Param>::CanonizedFilterMeta &meta);
std::string to_string();
~FilterDesc();
cudnnFilterDescriptor_t desc;
};
@@ -25,6 +25,10 @@ using namespace cuda;
#define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF
#endif
#if CUDA_VERSION < 11000
#define CUBLAS_COMPUTE_32I CUDA_R_32I
#endif
bool MatrixMulForwardImpl::AlgoCuBlas::is_available(
const SizeArgs& args) const {
if (args.opr->param().format != param::MatrixMul::Format::DEFAULT)
@@ -117,7 +121,7 @@ void MatrixMulForwardImpl::AlgoCuBlas::exec(const ExecArgs& args) const {
args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr,
CUDA_R_8I, args.tensor_a.layout.stride[0], zero,
args.tensor_c.raw_ptr, CUDA_R_32I,
args.tensor_c.layout.stride[0], CUDA_R_32I, CUBLAS_GEMM_DFALT));
args.tensor_c.layout.stride[0], CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DFALT));
};
// Note that cublas takes column-major matrices as inputs,
@@ -6,10 +6,11 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/cuda/matrix_mul/cublasLt_wrapper.h"
#include "src/common/utils.h"
#include "src/cuda/matrix_mul/cublasLt_wrapper.h"
#include "src/cuda/utils.h"
#if CUDA_VERSION >= 10010
@@ -33,6 +34,7 @@ static cudaDataType_t to_cuda_dtype(DType tp) {
}
}
#if CUDA_VERSION >= 11000
static cublasComputeType_t to_cublas_compute_type(DType tp) {
switch (tp.enumv()) {
case DTypeEnum::Float16:
@@ -43,10 +45,11 @@ static cublasComputeType_t to_cublas_compute_type(DType tp) {
case DTypeEnum::QuantizedS32:
return CUBLAS_COMPUTE_32I;
default:
megdnn_throw(megdnn_mangle(
"dtype must be float16/float32/int32/Qs32"));
megdnn_throw(
megdnn_mangle("dtype must be float16/float32/int32/Qs32"));
}
}
#endif
static const char* cuda_type_to_str(cudaDataType_t tp) {
switch (tp) {
@@ -106,9 +109,15 @@ void CUBLASLTMatmulDesc::set(const SizeArgs& args, bool batched) {
dt_b = to_cuda_dtype(args.layout_b.dtype);
dt_a = to_cuda_dtype(args.layout_a.dtype);
dt_c = to_cuda_dtype(args.layout_c.dtype);
dt_compute = to_cublas_compute_type(args.layout_c.dtype);
megdnn_assert(dt_a == dt_b, "matrix A and B should have same precision");
#if CUDA_VERSION >= 11000
dt_compute = to_cublas_compute_type(args.layout_c.dtype);
cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute, dt_c));
#else
dt_compute = dt_c;
cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute));
#endif
cublas_check(cublasLtMatmulDescSetAttribute(
matmul_desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pm, sizeof(pm)));
@@ -262,8 +271,7 @@ WorkspaceBundle CUBLASLTMatmulDesc::get_workspace_bundle(
dt_c == CUDA_R_32I ? layout_trans_b : layout_b,
dt_c == CUDA_R_32I ? layout_trans_a : layout_a,
dt_c == CUDA_R_32I ? layout_trans_c : layout_c,
dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo,
&result);
dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo, &result);
// return empty WorkspaceBundle if cublasLtMatmulAlgoCheck() failed
if (status != CUBLAS_STATUS_SUCCESS)
return {nullptr, {}};
@@ -48,7 +48,11 @@ struct CUBLASLTMatmulDesc {
bool is_batched;
cublasLtMatmulDesc_t matmul_desc;
cudaDataType_t dt_a, dt_b, dt_c;
#if CUDA_VERSION >= 11000
cublasComputeType_t dt_compute;
#else
cudaDataType_t dt_compute;
#endif
cublasLtMatrixLayout_t layout_a, layout_b, layout_c;
cublasLtMatrixLayout_t layout_trans_a, layout_trans_b, layout_trans_c;
size_t workspace_a, workspace_b, workspace_c;
@@ -128,7 +128,23 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const {
stream));
cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc));
};
switch(desc.dt_compute) {
#if CUDA_VERSION >= 11000
switch (desc.dt_compute) {
case CUBLAS_COMPUTE_16F:
hgemm();
break;
case CUBLAS_COMPUTE_32F:
sgemm();
break;
case CUBLAS_COMPUTE_32I:
igemm();
break;
default:
megdnn_throw(megdnn_mangle(
"compute type must be float16/float32/int32"));
}
#else
switch (desc.dt_compute) {
case CUDA_R_16F:
hgemm();
break;
@@ -139,8 +155,10 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const {
igemm();
break;
default:
megdnn_throw(megdnn_mangle("compute type must be float16/float32/int32"));
megdnn_throw(megdnn_mangle(
"compute type must be float16/float32/int32"));
}
#endif
}
#endif
// vim: syntax=cpp.doxygen
@@ -309,6 +309,9 @@ void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args, | |||||
arg.f / (1e12); | arg.f / (1e12); | ||||
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | ||||
filter{arg.co, arg.ci, arg.f, arg.f}; | filter{arg.co, arg.ci, arg.f, arg.f}; | ||||
if (!algo) { | |||||
algo = "no_name"; | |||||
} | |||||
printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, " | printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, " | ||||
"time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f " | "time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f " | ||||
"%.2fTops, " | "%.2fTops, " | ||||
@@ -1,2 +1,3 @@ | |||||
Makefile | Makefile | ||||
/test/imperative_test | /test/imperative_test | ||||
python/megengine/version.py |
@@ -70,3 +70,8 @@ add_custom_command( | |||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt | COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt | ||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt | COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt | ||||
) | ) | ||||
add_custom_command( | |||||
TARGET ${MODULE_NAME} POST_BUILD | |||||
COMMAND "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/python/gen_version.py --output ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/version.py | |||||
) |
@@ -0,0 +1,31 @@ | |||||
import argparse | |||||
import os | |||||
import subprocess | |||||
def get_git_commit(src_dir): | |||||
try: | |||||
return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=src_dir).decode('ascii').strip() | |||||
except Exception: | |||||
return 'unknown' | |||||
def get_mge_version(version_txt_path): | |||||
v = {} | |||||
with open(version_txt_path) as fp: | |||||
exec(fp.read(), v) | |||||
return v | |||||
if __name__ == "__main__": | |||||
parser = argparse.ArgumentParser(description="generate version.py to build path") | |||||
parser.add_argument("--output", type=str, required=True) | |||||
args = parser.parse_args() | |||||
python_dir = os.path.dirname(__file__) | |||||
version_txt_path = os.path.join(python_dir, 'version_template.py') | |||||
commit_id = get_git_commit(python_dir) | |||||
mge_ver_map = get_mge_version(version_txt_path) | |||||
mge_ver = mge_ver_map['__version__'] if '__version__' in mge_ver_map else 'unknown' | |||||
mge_intl = mge_ver_map['__internal__'] if '__internal__' in mge_ver_map else False | |||||
with open(args.output, 'w') as f: | |||||
f.write("__version__ = '{}'\n".format(mge_ver)) | |||||
f.write("git_version = {}\n".format(repr(commit_id))) | |||||
if mge_intl: | |||||
f.write("__internal__ = True\n") |
@@ -0,0 +1,10 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
# | |||||
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, | |||||
# software distributed under the License is distributed on an | |||||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
__version__ = "1.3.0.dev" | |||||
@@ -8,7 +8,7 @@ | |||||
```bash | ```bash | ||||
1: please refer to: https://docs.docker.com/engine/security/rootless/ to enable rootless docker env | 1: please refer to: https://docs.docker.com/engine/security/rootless/ to enable rootless docker env | ||||
2: cd ./scripts/whl/manylinux2010 | |||||
2: cd ./scripts/whl/manylinux2014 | |||||
3: ./build_image.sh | 3: ./build_image.sh | ||||
``` | ``` | ||||
@@ -56,24 +56,25 @@ | |||||
``` | ``` | ||||
# How to build | # How to build | ||||
Note: make sure the git repo is mounted in the docker container; do not use `git submodule update --init` inside the container to init the megbrain repo | |||||
## Build for linux | ## Build for linux | ||||
* MegBrain delivers `wheel` package with `manylinux2010` tag defined in [PEP-571](https://www.python.org/dev/peps/pep-0571/). | |||||
* MegBrain delivers `wheel` package with `manylinux2014` tag defined in [PEP-599](https://www.python.org/dev/peps/pep-0599/). | |||||
commands: | commands: | ||||
```bash | ```bash | ||||
export CUDA_ROOT_DIR=/path/to/cuda | export CUDA_ROOT_DIR=/path/to/cuda | ||||
export CUDNN_ROOT_DIR=/path/to/cudnn | export CUDNN_ROOT_DIR=/path/to/cudnn | ||||
export TENSORRT_ROOT_DIR=/path/to/tensorrt | export TENSORRT_ROOT_DIR=/path/to/tensorrt | ||||
./scripts/whl/manylinux2010/build_wheel.sh | |||||
./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||||
``` | ``` | ||||
* And you can find all of the outputs in the `output` directory. If you just want to build for a specific Python version, you can use the `ALL_PYTHON` environment variable, e.g.: | * And you can find all of the outputs in the `output` directory. If you just want to build for a specific Python version, you can use the `ALL_PYTHON` environment variable, e.g.: | ||||
```bash | ```bash | ||||
ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh | |||||
ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||||
``` | ``` | ||||
* If you just want to build a CPU-only version, you can set the `BUILD_WHL_CPU_ONLY` environment variable to 'ON', e.g.: | * If you just want to build a CPU-only version, you can set the `BUILD_WHL_CPU_ONLY` environment variable to 'ON', e.g.: | ||||
```bash | ```bash | ||||
BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh | |||||
BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||||
``` | ``` | ||||
## Build for MacOS | ## Build for MacOS | ||||
@@ -0,0 +1,15 @@ | |||||
FROM quay.io/pypa/manylinux2014_x86_64:2020-12-31-56195b3 | |||||
ENV UID=1024 \ | |||||
PATH=${PATH}:/usr/local/cuda/bin \ | |||||
LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ | |||||
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ | |||||
CPATH=${CPATH}:/usr/local/cuda/include:/opt/cudnn/include:/opt/tensorrt/include | |||||
ARG platform | |||||
COPY setup_mirror.sh . | |||||
RUN ./setup_mirror.sh "$platform" | |||||
ADD init_image.sh /tmp | |||||
RUN /tmp/init_image.sh && rm -f /tmp/init_image.sh | |||||
@@ -0,0 +1,5 @@ | |||||
#!/bin/bash -e | |||||
cd $(dirname $0) | |||||
docker build -t env_manylinux2014:latest . |
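The Dockerfile above declares an `ARG platform` that `setup_mirror.sh` consumes, but `build_image.sh` does not forward it; a hedged sketch of passing it by hand (the `brainpp` value is simply the case handled by the mirror script added later in this patch):
```bash
# Optional: forward a platform so setup_mirror.sh configures matching mirrors.
docker build --build-arg platform=brainpp -t env_manylinux2014:latest .
```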
@@ -0,0 +1,217 @@ | |||||
#!/bin/bash | |||||
set -e | |||||
CWD=$(dirname $0) | |||||
BASEDIR=$(readlink -f ${CWD}/../../..) | |||||
OUTPUTDIR=$(readlink -f ${CWD}/output) | |||||
USERID=$(id -u) | |||||
TMPFS_ARGS="--tmpfs /tmp:exec" | |||||
local_path=$(dirname $(readlink -f $0)) | |||||
CUDNN_LIB_DIR="/opt/cudnn/lib64/" | |||||
CUDA_LIB_DIR="/usr/local/cuda/lib64/" | |||||
CUDA_SDK="unknown" | |||||
function usage() { | |||||
echo "use '-sdk cu111' to specify cuda toolkit config, also support cu101, cu112" | |||||
} | |||||
while [ "$1" != "" ]; do | |||||
case $1 in | |||||
-sdk) | |||||
shift | |||||
CUDA_SDK=$1 | |||||
shift | |||||
;; | |||||
*) | |||||
usage | |||||
exit 1 | |||||
esac | |||||
done | |||||
echo "Build with ${CUDA_SDK}" | |||||
if [ $CUDA_SDK == "cu101" ];then | |||||
COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | |||||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF" | |||||
OUT_DIR="cu101" | |||||
BUILD_GCC8="ON" | |||||
REQUIR_CUDA_VERSION="10010" | |||||
REQUIR_CUDNN_VERSION="7.6.3" | |||||
REQUIR_TENSORRT_VERSION="6.0.1.5" | |||||
elif [ $CUDA_SDK == "cu111" ];then | |||||
COPY_LIB_LIST="\ | |||||
${CUDA_LIB_DIR}/libnvrtc.so.11.1:\ | |||||
${CUDA_LIB_DIR}/libcublasLt.so.11:\ | |||||
${CUDA_LIB_DIR}/libcublas.so.11:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn.so.8" | |||||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\ | |||||
-gencode arch=compute_61,code=sm_61 \ | |||||
arch=compute_70,code=sm_70 \ | |||||
arch=compute_75,code=sm_75 \ | |||||
arch=compute_80,code=sm_80 \ | |||||
arch=compute_86,code=sm_86 \ | |||||
arch=compute_86,code=compute_86" | |||||
OUT_DIR="cu111" | |||||
REQUIR_CUDA_VERSION="11010" | |||||
REQUIR_CUDNN_VERSION="8.0.5" | |||||
REQUIR_TENSORRT_VERSION="7.2.2.3" | |||||
elif [ $CUDA_SDK == "cu112" ];then | |||||
COPY_LIB_LIST="\ | |||||
${CUDA_LIB_DIR}/libnvrtc.so.11.2:\ | |||||
${CUDA_LIB_DIR}/libcublasLt.so.11:\ | |||||
${CUDA_LIB_DIR}/libcublas.so.11:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||||
${CUDNN_LIB_DIR}/libcudnn.so.8" | |||||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \ | |||||
-gencode arch=compute_61,code=sm_61 \ | |||||
arch=compute_70,code=sm_70 \ | |||||
arch=compute_75,code=sm_75 \ | |||||
arch=compute_80,code=sm_80 \ | |||||
arch=compute_86,code=sm_86 \ | |||||
arch=compute_86,code=compute_86" | |||||
OUT_DIR="cu112" | |||||
REQUIR_CUDA_VERSION="11020" | |||||
REQUIR_CUDNN_VERSION="8.0.5" | |||||
REQUIR_TENSORRT_VERSION="7.2.2.3" | |||||
else | |||||
echo "no support sdk ${CUDA_SDK}, please set by '-sdk cu111'" | |||||
exit -1 | |||||
fi | |||||
BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} | |||||
if [[ -z ${BUILD_WHL_CPU_ONLY} ]] | |||||
then | |||||
BUILD_WHL_CPU_ONLY="OFF" | |||||
fi | |||||
echo ${BASEDIR} | |||||
pushd ${BASEDIR}/third_party >/dev/null | |||||
./prepare.sh | |||||
popd >/dev/null | |||||
cd ${CWD} | |||||
mkdir -p ${OUTPUTDIR} | |||||
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||||
if [[ -z ${CUDA_ROOT_DIR} ]]; then | |||||
echo "Environment variable CUDA_ROOT_DIR not set." | |||||
exit -1 | |||||
fi | |||||
if [[ -z ${CUDNN_ROOT_DIR} ]]; then | |||||
echo "Environment variable CUDNN_ROOT_DIR not set." | |||||
exit -1 | |||||
fi | |||||
if [[ -z ${TENSORRT_ROOT_DIR} ]]; then | |||||
echo "Environment variable TENSORRT_ROOT_DIR not set." | |||||
exit -1 | |||||
fi | |||||
## YOU SHOULD MODIFY THE CUDA VERSION CHECKS BELOW WHEN UPGRADING | |||||
CUDA_ROOT_DIR_=${CUDA_ROOT_DIR%*/} | |||||
CUDNN_ROOT_DIR_=${CUDNN_ROOT_DIR%*/} | |||||
TENSORRT_ROOT_DIR_=${TENSORRT_ROOT_DIR%*/} | |||||
CUDA_VERSION_PATH=${CUDA_ROOT_DIR_}/include/cuda.h | |||||
if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then | |||||
CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn_version.h | |||||
else | |||||
CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn.h | |||||
fi | |||||
TENSORRT_VERSION_PATH=${TENSORRT_ROOT_DIR_}/include/NvInferVersion.h | |||||
if [ ! -e $CUDA_VERSION_PATH ] ; then | |||||
echo "file $CUDA_VERSION_PATH does not exist" | |||||
echo "please check the environment: CUDA-$REQUIR_CUDA_VERSION is required" | |||||
exit -1 | |||||
fi | |||||
if [ ! -e $CUDNN_VERSION_PATH ] ; then | |||||
echo "file $CUDNN_VERSION_PATH does not exist" | |||||
echo "please check the environment: CUDNN-V$REQUIR_CUDNN_VERSION is required" | |||||
exit -1 | |||||
fi | |||||
if [ ! -e $TENSORRT_VERSION_PATH ] ; then | |||||
echo "file $TENSORRT_VERSION_PATH does not exist" | |||||
echo "please check the environment: TensorRT-$REQUIR_TENSORRT_VERSION is required" | |||||
exit -1 | |||||
fi | |||||
CUDA_VERSION_CONTEXT=$(head -300 ${CUDA_VERSION_PATH}) | |||||
CUDNN_VERSION_CONTEXT=$(head -62 ${CUDNN_VERSION_PATH}) | |||||
TENSORRT_VERSION_CONTEXT=$(tail -12 ${TENSORRT_VERSION_PATH}) | |||||
if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then | |||||
CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define CUDA_VERSION * +([0-9]+)") | |||||
else | |||||
CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define __CUDA_API_VERSION * +([0-9]+)") | |||||
fi | |||||
CUDA_VERSION=${CUDA_API_VERSION:0-5} | |||||
echo CUDA_VERSION:$CUDA_VERSION | |||||
CUDNN_VERSION_MAJOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MAJOR * +([0-9]+)") | |||||
CUDNN_VERSION_MINOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MINOR * +([0-9]+)") | |||||
CUDNN_VERSION_PATCH=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_PATCHLEVEL * +([0-9]+)") | |||||
CUDNN_VERSION=${CUDNN_VERSION_MAJOR:0-1}.${CUDNN_VERSION_MINOR:0-1}.${CUDNN_VERSION_PATCH:0-1} | |||||
echo CUDNN_VERSION:$CUDNN_VERSION | |||||
TENSORRT_VERSION_MAJOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MAJOR * +([0-9]+)") | |||||
TENSORRT_VERSION_MINOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MINOR * +([0-9]+)") | |||||
TENSORRT_VERSION_PATCH=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_PATCH * +([0-9]+)") | |||||
TENSORRT_VERSION_BUILD=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_BUILD * +([0-9]+)") | |||||
TENSORRT_VERSION=${TENSORRT_VERSION_MAJOR:0-1}.${TENSORRT_VERSION_MINOR:0-1}.${TENSORRT_VERSION_PATCH:0-1}.${TENSORRT_VERSION_BUILD:0-1} | |||||
echo TENSORRT_VERSION:$TENSORRT_VERSION | |||||
if [ $CUDA_VERSION != $REQUIR_CUDA_VERSION ] ; then | |||||
echo "please check the environment: CUDA-$REQUIR_CUDA_VERSION is required" | |||||
exit -1 | |||||
fi | |||||
if [ $CUDNN_VERSION != $REQUIR_CUDNN_VERSION ] ; then | |||||
echo "please check the environment: CUDNN-V$REQUIR_CUDNN_VERSION is required" | |||||
exit -1 | |||||
fi | |||||
if [ $TENSORRT_VERSION != $REQUIR_TENSORRT_VERSION ] ; then | |||||
echo "please check the environment: TensorRT-$REQUIR_TENSORRT_VERSION is required" | |||||
exit -1 | |||||
fi | |||||
fi | |||||
if [[ -z ${BUILD_GCC8} ]];then | |||||
BUILD_GCC8=OFF | |||||
fi | |||||
if [ "$BUILD_GCC8" == "ON" ];then | |||||
run_cmd="scl enable devtoolset-8 /home/code/scripts/whl/manylinux2014/do_build_common.sh" | |||||
else | |||||
run_cmd="/home/code/scripts/whl/manylinux2014/do_build_common.sh" | |||||
fi | |||||
docker run --rm -it $TMPFS_ARGS \ | |||||
-e UID=${USERID} \ | |||||
-e LOCAL_VERSION=${LOCAL_VERSION} \ | |||||
-e BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} \ | |||||
-e ALL_PYTHON="${ALL_PYTHON}" \ | |||||
-e EXTRA_CMAKE_FLAG="$EXTRA_CMAKE_FLAG" \ | |||||
-e COPY_LIB_LIST="$COPY_LIB_LIST" \ | |||||
-e OUT_DIR="$OUT_DIR" \ | |||||
-v ${CUDA_ROOT_DIR}:/usr/local/cuda \ | |||||
-v ${CUDNN_ROOT_DIR}:/opt/cudnn \ | |||||
-v ${TENSORRT_ROOT_DIR}:/opt/tensorrt \ | |||||
-v ${BASEDIR}:/home/code \ | |||||
-v ${OUTPUTDIR}:/home/output:rw \ | |||||
env_manylinux2014:latest /bin/bash -c "$run_cmd" | |||||
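As a hedged end-to-end example matching the cu111 checks above (all paths are placeholders; the script itself verifies CUDA 11.1, cuDNN 8.0.5 and TensorRT 7.2.2.3):
```bash
export CUDA_ROOT_DIR=/usr/local/cuda-11.1       # must resolve to CUDA 11010
export CUDNN_ROOT_DIR=/opt/cudnn-8.0.5          # must resolve to cuDNN 8.0.5
export TENSORRT_ROOT_DIR=/opt/TensorRT-7.2.2.3  # must resolve to TensorRT 7.2.2.3
./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu111
```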
@@ -0,0 +1,136 @@ | |||||
#!/bin/bash -ex | |||||
function handle_strip() { | |||||
echo "now handle strip $1" | |||||
objcopy --only-keep-debug $1 $1.dbg | |||||
strip -s $1 | |||||
objcopy --add-gnu-debuglink=$1.dbg $1 | |||||
rm $1.dbg | |||||
} | |||||
function full_copy_so(){ | |||||
lib_path=$1 | |||||
dst_dir=$2 | |||||
append_rpath=$3 | |||||
lib_name=$(basename $lib_path) | |||||
cp $lib_path $dst_dir/$lib_name | |||||
if [ "$append_rpath" != "" ];then | |||||
ori_rpath=$(patchelf --print-rpath $dst_dir/$lib_name) | |||||
if [ "$ori_rpath" != "" ];then | |||||
patchelf --set-rpath "$ori_rpath:$append_rpath" $dst_dir/$lib_name | |||||
else | |||||
patchelf --set-rpath "$append_rpath" $dst_dir/$lib_name | |||||
fi | |||||
fi | |||||
} | |||||
function patch_elf_depend_lib() { | |||||
echo "handle common depend lib" | |||||
LIBS_DIR=${BUILD_DIR}/staging/megengine/core/lib | |||||
mkdir -p ${LIBS_DIR} | |||||
cp /usr/lib64/libatomic.so.1 ${LIBS_DIR} | |||||
patchelf --remove-rpath ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so | |||||
patchelf --force-rpath --set-rpath '$ORIGIN/lib' ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so | |||||
cp ${BUILD_DIR}/src/libmegengine_export.so ${LIBS_DIR} | |||||
patchelf --remove-rpath ${LIBS_DIR}/libmegengine_export.so | |||||
patchelf --force-rpath --set-rpath '$ORIGIN/.' ${LIBS_DIR}/libmegengine_export.so | |||||
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||||
echo "handle cuda lib" | |||||
cp ${BUILD_DIR}/dnn/cuda-stub/libcuda_stub.so ${LIBS_DIR} | |||||
cp /usr/local/cuda/lib64/libnvToolsExt.so.1 ${LIBS_DIR} | |||||
IFS=: read -a lib_name_array <<<"$COPY_LIB_LIST" | |||||
append_rpath='$ORIGIN/.' | |||||
for lib_name in ${lib_name_array[@]};do | |||||
full_copy_so $lib_name ${LIBS_DIR} $append_rpath | |||||
done | |||||
fi | |||||
} | |||||
ALL_PYTHON=${ALL_PYTHON} | |||||
if [[ -z ${ALL_PYTHON} ]] | |||||
then | |||||
ALL_PYTHON="35m 36m 37m 38" | |||||
fi | |||||
BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} | |||||
if [[ -z ${BUILD_WHL_CPU_ONLY} ]] | |||||
then | |||||
BUILD_WHL_CPU_ONLY="OFF" | |||||
fi | |||||
SRC_DIR=$(readlink -f "`dirname $0`/../../../") | |||||
BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/ | |||||
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||||
BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_ON/MGE_INFERENCE_ONLY_OFF/Release/build/ | |||||
fi | |||||
NEW_LIB_PATH=core/lib | |||||
for ver in ${ALL_PYTHON} | |||||
do | |||||
USE_AUDITWHEEL="ON" | |||||
python_ver=${ver:0:2} | |||||
MAJOR=${python_ver:0:1} | |||||
MINOR=${ver:1} | |||||
PYTHON_DIR=/opt/python/cp${python_ver}-cp${ver}/ | |||||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} ${EXTRA_CMAKE_FLAG}" | |||||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo" | |||||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_PREFIX_PATH=${PYTHON_DIR}" | |||||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_EXECUTABLE=${PYTHON_DIR}/bin/python3" | |||||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_LIBRARY=${PYTHON_DIR}lib/" | |||||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python${MAJOR}.${MINOR}" | |||||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DMGE_WITH_ATLAS=ON" | |||||
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||||
${SRC_DIR}/scripts/cmake-build/host_build.sh -c -t -r | |||||
else | |||||
${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r | |||||
fi | |||||
cd ${BUILD_DIR} | |||||
rm -rf staging | |||||
mkdir -p staging | |||||
cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ | |||||
handle_strip ${BUILD_DIR}/src/libmegengine_export.so | |||||
cd ${BUILD_DIR}/staging/megengine/core | |||||
handle_strip _imperative_rt.so | |||||
mkdir -p lib/ucx | |||||
if [ ${USE_AUDITWHEEL} = "OFF" ]; then | |||||
patch_elf_depend_lib | |||||
fi | |||||
cd ${BUILD_DIR}/staging/ | |||||
${PYTHON_DIR}/bin/python setup.py bdist_wheel | |||||
cd /home/output | |||||
if [ ${USE_AUDITWHEEL} = "ON" ]; then | |||||
LD_LIBRARY_PATH=${BUILD_DIR}/dnn/cuda-stub:$LD_LIBRARY_PATH auditwheel repair -L ${NEW_LIB_PATH} ${BUILD_DIR}/staging/dist/Meg*.whl | |||||
else | |||||
mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR} | |||||
cd ${BUILD_DIR}/staging/dist/ | |||||
org_whl_name=`ls Meg*${ver}*.whl` | |||||
compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'` | |||||
echo "org whl name: ${org_whl_name}" | |||||
echo "comapt whl name: ${compat_whl_name}" | |||||
mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR}/${compat_whl_name} | |||||
cd /home/output | |||||
fi | |||||
chown -R ${UID}.${UID} . | |||||
# compat for root-less docker env to remove output at host side | |||||
chmod -R 777 . | |||||
echo "python $ver done" | |||||
done | |||||
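A small standalone sketch of the `COPY_LIB_LIST` handling inside `patch_elf_depend_lib` above, for readers unfamiliar with the `IFS=: read -a` idiom (the library paths are made up):
```bash
COPY_LIB_LIST="/usr/local/cuda/lib64/libnvrtc.so.11.1:/opt/cudnn/lib64/libcudnn.so.8"
IFS=: read -r -a lib_name_array <<< "$COPY_LIB_LIST"
for lib_name in "${lib_name_array[@]}"; do
    echo "would copy and patch rpath for: $lib_name"   # full_copy_so does the real work
done
```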
@@ -0,0 +1,70 @@ | |||||
#!/bin/bash -e | |||||
GET_PIP_URL='https://bootstrap.pypa.io/get-pip.py' | |||||
SWIG_URL='https://downloads.sourceforge.net/project/swig/swig/swig-3.0.12/swig-3.0.12.tar.gz?use_mirror=autoselect' | |||||
LLVM_URL='https://github.com/llvm-mirror/llvm/archive/release_60.tar.gz' | |||||
CLANG_URL='https://github.com/llvm-mirror/clang/archive/release_60.tar.gz' | |||||
yum install -y pcre-devel devtoolset-9-libatomic-devel.x86_64 | |||||
yum install -y devtoolset-8 devtoolset-8-libatomic-devel.x86_64 | |||||
for ver in 35m 36m 37m 38 | |||||
do | |||||
python_ver=${ver:0:2} | |||||
curl ${GET_PIP_URL} | /opt/python/cp${python_ver}-cp${ver}/bin/python - \ | |||||
--no-cache-dir --only-binary :all: | |||||
/opt/python/cp${python_ver}-cp${ver}/bin/pip install \ | |||||
--no-cache-dir --only-binary :all: numpy==1.18.1 setuptools==46.1.3 | |||||
done | |||||
pushd /home >/dev/null | |||||
echo "Install swig" | |||||
curl -sSL ${SWIG_URL} | tar xz | |||||
pushd swig-3.0.12 >/dev/null | |||||
mkdir build | |||||
pushd build >/dev/null | |||||
../configure | |||||
make -j$(nproc) | |||||
make install | |||||
popd >/dev/null | |||||
popd >/dev/null | |||||
rm -rf swig-3.0.12 | |||||
echo "Install llvm" | |||||
curl -sSL ${LLVM_URL} | tar xz | |||||
pushd llvm-release_60 >/dev/null | |||||
mkdir build | |||||
pushd build >/dev/null | |||||
cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ | |||||
-DCMAKE_BUILD_TYPE=Release | |||||
make -j$(nproc) | |||||
make install | |||||
popd >/dev/null | |||||
popd >/dev/null | |||||
rm -rf llvm-release_60 | |||||
echo "Install clang" | |||||
curl -sSL ${CLANG_URL} | tar xz | |||||
pushd clang-release_60 >/dev/null | |||||
mkdir build | |||||
pushd build >/dev/null | |||||
cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ | |||||
-DCMAKE_BUILD_TYPE=Release | |||||
make -j$(nproc) | |||||
make install | |||||
popd >/dev/null | |||||
popd >/dev/null | |||||
rm -rf clang-release_60 | |||||
popd >/dev/null | |||||
pushd /tmp >/dev/null | |||||
curl -sSL https://github.com/NixOS/patchelf/archive/0.12.tar.gz | tar xz | |||||
pushd /tmp/patchelf-0.12 >/dev/null | |||||
sed -i '331s/32/64/' ./src/patchelf.cc | |||||
./bootstrap.sh && ./configure && make install-strip | |||||
popd | |||||
rm -rf /tmp/patchelf-0.12 | |||||
popd | |||||
yum clean all |
@@ -0,0 +1,65 @@ | |||||
#!/bin/bash | |||||
set -e | |||||
function set_tuna_yum_mirror() { | |||||
cp /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.bak | |||||
local repo=/etc/yum.repos.d/CentOS-Base.repo | |||||
local plugin=/etc/yum/pluginconf.d/fastestmirror.conf | |||||
sed -i "s/mirrorlist=/#mirrorlist=/g" $repo | |||||
sed -i "s/#baseurl/baseurl/g" $repo | |||||
sed -i "s/mirror.centos.org/mirrors.tuna.tsinghua.edu.cn/g" $repo | |||||
sed -i "s/http/https/g" $repo | |||||
sed -i "s/enabled=1/enabled=0/g" $plugin | |||||
yum clean all | |||||
# Builds on brainpp are unable to pull epel repo metadata, so disable it | |||||
# https://unix.stackexchange.com/questions/148144/unable-to-pull-epel-repository-metadata | |||||
yum --disablerepo="epel" update nss | |||||
yum makecache | |||||
} | |||||
function set_epel() { | |||||
mv /etc/yum.repos.d/epel.repo /etc/yum.repos.d/epel.repo.backup | |||||
mv /etc/yum.repos.d/epel-testing.repo /etc/yum.repos.d/epel-testing.repo.backup | |||||
curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo | |||||
} | |||||
function set_yum_mirror() { | |||||
mv /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.backup | |||||
curl -o /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-7.repo | |||||
yum makecache | |||||
} | |||||
function set_pip_mirror() { | |||||
cat > /etc/pip.conf <<EOF | |||||
[global] | |||||
timeout = 180 | |||||
index-url = https://mirrors.aliyun.com/pypi/simple | |||||
extra-index-url = | |||||
http://mirrors.i.brainpp.cn/pypi/simple/ | |||||
http://pypi.i.brainpp.cn/brain/dev/+simple | |||||
https://pypi.tuna.tsinghua.edu.cn/simple | |||||
trusted-host = | |||||
mirrors.i.brainpp.cn | |||||
pypi.i.brainpp.cn | |||||
pypi.tuna.tsinghua.edu.cn | |||||
mirrors.aliyun.com | |||||
EOF | |||||
} | |||||
function main() { | |||||
local platform=$1 | |||||
case $platform in | |||||
brainpp) | |||||
set_epel | |||||
set_yum_mirror | |||||
set_pip_mirror | |||||
;; | |||||
*) | |||||
echo "No setup required" | |||||
;; | |||||
esac | |||||
} | |||||
main "$@" |
@@ -81,8 +81,9 @@ void NMSKeep::CUDAKern::exec(const NMSKeep* opr, const DeviceTensorND& inp, | |||||
init(opr, inp.shape()); | init(opr, inp.shape()); | ||||
auto inp_ptr = inp.ptr<float>(); | auto inp_ptr = inp.ptr<float>(); | ||||
auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace.raw_ptr()), | |||||
dev_rm_mask = reinterpret_cast<uint64_t*>( | |||||
void* workspace_ptr = workspace.raw_ptr(); | |||||
auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace_ptr), | |||||
dev_rm_mask = (uint64_t*)( | |||||
workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align); | workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align); | ||||
auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()), | auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()), | ||||
out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>()); | out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>()); | ||||
@@ -27,6 +27,9 @@ | |||||
#include "megbrain/gopt/inference.h" | #include "megbrain/gopt/inference.h" | ||||
#include "megbrain/gopt/misc.h" | #include "megbrain/gopt/misc.h" | ||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||||
using namespace mgb; | using namespace mgb; | ||||
using namespace gopt; | using namespace gopt; | ||||
using namespace cg; | using namespace cg; | ||||
@@ -1749,6 +1752,7 @@ void mgb::tensorrt::transform_dest_vars_inplace( | |||||
optimizer.apply_inplace(dest_vars); | optimizer.apply_inplace(dest_vars); | ||||
} | } | ||||
#pragma GCC diagnostic pop | |||||
#endif | #endif | ||||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -20,7 +20,8 @@ | |||||
#include "megbrain/utils/debug.h" | #include "megbrain/utils/debug.h" | ||||
#if MGB_ENABLE_TENSOR_RT | #if MGB_ENABLE_TENSOR_RT | ||||
#pragma GCC diagnostic push | |||||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||||
#include "megbrain/tensorrt/tensorrt_opr.h" | #include "megbrain/tensorrt/tensorrt_opr.h" | ||||
#include "make_trt_net.h" | #include "make_trt_net.h" | ||||
@@ -111,7 +112,8 @@ intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() { | |||||
host_b = range_gen({1, 8, 1, 1}); | host_b = range_gen({1, 8, 1, 1}); | ||||
{ | { | ||||
float* ptr = reinterpret_cast<float*>(host_w->raw_ptr()); | |||||
void* w_ptr = host_w->raw_ptr(); | |||||
float* ptr = reinterpret_cast<float*>(w_ptr); | |||||
ptr[0] = -127*1.1f; | ptr[0] = -127*1.1f; | ||||
ptr[1] = 127*1.1f; | ptr[1] = 127*1.1f; | ||||
} | } | ||||
@@ -362,6 +364,7 @@ intl::ConcatConvTensorRTNetwork::create_trt_network(bool has_batch_dim) { | |||||
return std::make_pair(builder, network); | return std::make_pair(builder, network); | ||||
} | } | ||||
#pragma GCC diagnostic pop | |||||
#endif // MGB_ENABLE_TENSOR_RT | #endif // MGB_ENABLE_TENSOR_RT | ||||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |