@@ -39,6 +39,9 @@ option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF) | |||
option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON) | |||
option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON) | |||
option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON) | |||
option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON) | |||
option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF) | |||
option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF) | |||
option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF) | |||
option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON) | |||
option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF) | |||
@@ -55,6 +58,14 @@ option(MGE_BUILD_SDK "Build load_and_run" ON) | |||
option(MGE_INFERENCE_ONLY "Build inference only library." OFF) | |||
option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support." ON)
option(MGE_WITH_ROCM "Enable ROCM support" OFF) | |||
option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF) | |||
if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB) | |||
set(MGE_WITH_ANY_CUDA_STUB ON) | |||
else() | |||
set(MGE_WITH_ANY_CUDA_STUB OFF) | |||
endif() | |||
if(NOT ${MGE_BIN_REDUCE} STREQUAL "") | |||
message(STATUS "build with BIN REDUCE") | |||
@@ -205,14 +216,24 @@ else() | |||
endif() | |||
endif() | |||
if(MGE_WITH_CUDA) | |||
include(cmake/cudnn.cmake) | |||
if(MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) | |||
message(WARNING "Static link CUDNN8 will auto enable MGE_WITH_LARGE_ARCHIVE=ON") | |||
set(MGE_WITH_LARGE_ARCHIVE ON) | |||
endif() | |||
endif() | |||
CHECK_CXX_COMPILER_FLAG(-fuse-ld=gold CXX_SUPPORT_GOLD) | |||
if(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32) | |||
if(MGE_WITH_LARGE_ARCHIVE) | |||
message(STATUS "Set -mcmodel=large and disable -fuse-ld=gold") | |||
set(MGE_COMMON_LINKER_FLAGS "-mcmodel=large") | |||
elseif(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32 AND NOT MGE_WITH_LARGE_ARCHIVE) | |||
message(STATUS "Using GNU gold linker.") | |||
set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") | |||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold") | |||
endif() | |||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}") | |||
if(NOT MGE_WITH_JIT) | |||
if(MGE_WITH_HALIDE) | |||
@@ -353,11 +374,28 @@ if(MGE_WITH_CUDA) | |||
if(NOT MGE_ENABLE_EXCEPTIONS) | |||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions") | |||
endif() | |||
if(NOT MGE_CUDA_GENCODE) | |||
if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386") | |||
set(MEGDNN_THREADS_512 0) | |||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") | |||
if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) | |||
message(WARNING "Static link CUDNN8 with many sm is unworkable, we only enable sm61 sm70 sm75 by default, and enable MGE_WITH_LARGE_ARCHIVE=ON") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") | |||
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.1.0") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=sm_86") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=compute_86") | |||
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.0.0") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=compute_80") | |||
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60") | |||
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61") | |||
@@ -385,7 +423,6 @@ if(MGE_WITH_CUDA) | |||
endif() | |||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}") | |||
include(cmake/cudnn.cmake) | |||
if(MGE_WITH_TRT) | |||
include(cmake/tensorrt.cmake) | |||
endif() | |||
@@ -394,12 +431,30 @@ if(MGE_WITH_CUDA) | |||
if(MSVC OR WIN32) | |||
list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY}) | |||
message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}") | |||
else() | |||
if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7) | |||
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer myelin_compiler_static myelin_executor_static myelin_pattern_runtime_static myelin_pattern_library_static -Wl,--no-whole-archive) | |||
else() | |||
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer -Wl,--no-whole-archive) | |||
endif() | |||
endif() | |||
endif() | |||
if("${CUDNN_VERSION}" STREQUAL "7.5.0") | |||
if(MSVC OR WIN32) | |||
message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") | |||
list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) | |||
else() | |||
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) | |||
message(STATUS "cudnn 7.5.0 has bug in cudnnConvolutionBiasActivationForward, need --whole-archive to workaround, ref https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_7xx.html") | |||
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) | |||
endif() | |||
else() | |||
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) | |||
if(MSVC OR WIN32) | |||
message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") | |||
list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY}) | |||
else() | |||
list(APPEND MGE_CUDA_LIBS libcudnn) | |||
endif() | |||
endif() | |||
if(MSVC OR WIN32) | |||
list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib) | |||
@@ -447,15 +502,37 @@ if(MGE_WITH_CUDA) | |||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") | |||
list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand) | |||
endif() | |||
list(APPEND MGE_CUDA_LIBS cudart) | |||
endif() | |||
if(NOT MGE_WITH_CUDA_STUB) | |||
if(MSVC OR WIN32) | |||
list(APPEND MGE_CUDA_LIBS cuda.lib) | |||
else() | |||
list(APPEND MGE_CUDA_LIBS cuda) | |||
endif() | |||
endif() | |||
if(NOT MGE_WITH_NVRTC_STUB) | |||
if(MSVC OR WIN32) | |||
list(APPEND MGE_CUDA_LIBS nvrtc.lib) | |||
else() | |||
list(APPEND MGE_CUDA_LIBS nvrtc) | |||
endif() | |||
endif() | |||
if(MGE_WITH_ANY_CUDA_STUB) | |||
add_subdirectory(dnn/cuda-stub) | |||
list(APPEND MGE_CUDA_LIBS cuda-stub) | |||
endif() | |||
add_subdirectory(dnn/cuda-stub) | |||
if(MSVC OR WIN32) | |||
list(APPEND MGE_CUDA_LIBS nvrtc.lib cuda-stub) | |||
list(APPEND MGE_CUDA_LIBS nvrtc.lib) | |||
else() | |||
list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) | |||
list(APPEND MGE_CUDA_LIBS nvToolsExt) | |||
endif() | |||
set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}") | |||
set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -lrt") | |||
endif() | |||
if(MGE_WITH_CAMBRICON) | |||
@@ -800,6 +877,9 @@ if(TARGET _imperative_rt) | |||
COMMAND ${CMAKE_COMMAND} -E create_symlink | |||
${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}> | |||
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}> | |||
COMMAND ${CMAKE_COMMAND} -E create_symlink | |||
${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py | |||
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py | |||
DEPENDS _imperative_rt | |||
VERBATIM | |||
) | |||
@@ -863,3 +943,9 @@ if(MGE_WITH_JIT_MLIR) | |||
add_subdirectory(tools/mlir/mgb-opt) | |||
add_subdirectory(tools/mlir/mgb-file-check) | |||
endif() | |||
if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED)) | |||
message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") | |||
message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") | |||
message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ") | |||
endif() |
@@ -11,7 +11,7 @@ if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "") | |||
set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR}) | |||
endif() | |||
if(MGE_CUDA_USE_STATIC) | |||
if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED) | |||
find_library(CUDNN_LIBRARY | |||
NAMES libcudnn_static.a cudnn.lib | |||
PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} | |||
@@ -42,7 +42,12 @@ if(CUDNN_INCLUDE_DIR STREQUAL "CUDNN_INCLUDE_DIR-NOTFOUND") | |||
message(FATAL_ERROR "Can not find CuDNN Library") | |||
endif() | |||
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) | |||
if(EXISTS ${CUDNN_INCLUDE_DIR}/cudnn_version.h) | |||
file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS) | |||
else() | |||
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) | |||
endif() | |||
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)" | |||
CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") | |||
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1" | |||
@@ -55,7 +60,9 @@ string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)" | |||
CUDNN_PATCH_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") | |||
string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1" | |||
CUDNN_PATCH_VERSION "${CUDNN_PATCH_VERSION}") | |||
set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}) | |||
set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCH_VERSION}) | |||
if(MGE_CUDA_USE_STATIC) | |||
add_library(libcudnn STATIC IMPORTED) | |||
@@ -1,11 +1,21 @@ | |||
file (GLOB_RECURSE SOURCES src/*.cpp) | |||
file (GLOB_RECURSE CUDA_STUB src/libcuda.cpp) | |||
file (GLOB_RECURSE NVRTC_STUB src/libnvrtc.cpp) | |||
if(MGE_WITH_CUDA_STUB) | |||
list(APPEND STUB_SRC ${CUDA_STUB}) | |||
endif() | |||
if(MGE_WITH_NVRTC_STUB) | |||
list(APPEND STUB_SRC ${NVRTC_STUB}) | |||
endif() | |||
if(MSVC OR WIN32) | |||
add_library (cuda-stub STATIC ${SOURCES}) | |||
add_library (cuda-stub STATIC ${STUB_SRC}) | |||
else() | |||
add_library (cuda-stub SHARED ${SOURCES}) | |||
add_library (cuda-stub SHARED ${STUB_SRC}) | |||
endif() | |||
set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda) | |||
set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda_stub) | |||
target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) | |||
if (MSVC OR WIN32) | |||
target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) | |||
@@ -0,0 +1,109 @@ | |||
#if defined(_WIN32) | |||
#include <windows.h> | |||
#define RTLD_LAZY 0 | |||
static void* dlopen(const char* file, int) { | |||
return static_cast<void*>(LoadLibraryA(file)); | |||
} | |||
static void* dlerror() { | |||
const char* errmsg = "dlerror not aviable in windows"; | |||
return const_cast<char*>(errmsg); | |||
} | |||
static void* dlsym(void* handle, const char* name) { | |||
FARPROC symbol = GetProcAddress((HMODULE)handle, name); | |||
return reinterpret_cast<void*>(symbol); | |||
} | |||
#else | |||
#include <dlfcn.h> | |||
#include <unistd.h> | |||
#endif | |||
#include <sstream> | |||
#include <string> | |||
#include <vector> | |||
static std::vector<std::string> split_string(const std::string& s, char delim) { | |||
std::vector<std::string> elems; | |||
std::stringstream ss(s); | |||
std::string item; | |||
while (std::getline(ss, item, delim)) { | |||
elems.push_back(item); | |||
} | |||
return elems; | |||
} | |||
static std::vector<std::string> get_env_dir(const char* env_name) { | |||
const char* env_p = std::getenv(env_name); | |||
std::vector<std::string> env_dir; | |||
if (env_p) { | |||
env_dir = split_string(env_p, ':'); | |||
} | |||
return env_dir; | |||
} | |||
static void* try_open_handle(std::vector<std::string> dir_vec, | |||
std::string default_so_name) { | |||
void* handle = nullptr; | |||
for (auto& tk_path : dir_vec) { | |||
handle = dlopen((tk_path + "/" + default_so_name).c_str(), RTLD_LAZY); | |||
if (handle) { | |||
break; | |||
} | |||
} | |||
return handle; | |||
} | |||
static void* try_open_handle(const char** so_vec, int nr_so) { | |||
void* handle = nullptr; | |||
for (int i = 0; i < nr_so; ++i) { | |||
handle = dlopen(so_vec[i], RTLD_LAZY); | |||
if (handle) { | |||
break; | |||
} | |||
} | |||
return handle; | |||
} | |||
static void* get_library_handle() {
// lookup order: LD_LIBRARY_PATH dirs, CUDA_TK_PATH dirs, the built-in
// default_so_paths, and finally extra_so_paths
std::vector<std::string> cuda_tk_dir = get_env_dir("CUDA_TK_PATH");
std::vector<std::string> ld_dir = get_env_dir("LD_LIBRARY_PATH");
void* handle = nullptr;
if (!handle) { | |||
handle = try_open_handle(ld_dir, default_so_name); | |||
} | |||
if (!handle) { | |||
handle = try_open_handle(cuda_tk_dir, default_so_name); | |||
} | |||
if (!handle) { | |||
handle = try_open_handle(default_so_paths, | |||
sizeof(default_so_paths) / sizeof(char*)); | |||
} | |||
if (!handle) { | |||
handle = try_open_handle(extra_so_paths, | |||
sizeof(extra_so_paths) / sizeof(char*)); | |||
} | |||
if (!handle) { | |||
LOGE("Failed to load %s API library", g_default_api_name); | |||
return nullptr; | |||
} | |||
return handle; | |||
} | |||
static void log_failed_load(int func_idx) { | |||
LOGE("failed to load %s func: %s", g_default_api_name, | |||
g_func_name[func_idx]); | |||
} | |||
static void* resolve_library_func(void* handle, const char* func) { | |||
if (!handle) { | |||
LOGE("%s handle should not be nullptr!", g_default_api_name); | |||
return nullptr; | |||
} | |||
auto ret = dlsym(handle, func); | |||
if (!ret) { | |||
LOGE("failed to load %s func: %s", g_default_api_name, func); | |||
} | |||
return ret; | |||
} |
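The helper above centralizes the dlopen search strategy shared by the CUDA and NVRTC stubs. A brief, self-contained illustration of that strategy (split a colon-separated environment variable into directories, probe each, then fall back to a default name); `dirs_from_env` and the use of `libm.so.6` as a stand-in are purely illustrative and not part of the tree:

```cpp
#include <dlfcn.h>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

// split a colon-separated environment variable into candidate directories
static std::vector<std::string> dirs_from_env(const char* name) {
    std::vector<std::string> dirs;
    if (const char* value = std::getenv(name)) {
        std::stringstream ss(value);
        for (std::string item; std::getline(ss, item, ':');)
            dirs.push_back(item);
    }
    return dirs;
}

int main() {
    const char* so_name = "libm.so.6";   // stand-in for libcuda/libnvrtc
    void* handle = nullptr;
    // 1) try every directory listed in LD_LIBRARY_PATH
    for (const auto& dir : dirs_from_env("LD_LIBRARY_PATH")) {
        handle = dlopen((dir + "/" + so_name).c_str(), RTLD_LAZY);
        if (handle) break;
    }
    // 2) fall back to the default system search path
    if (!handle) handle = dlopen(so_name, RTLD_LAZY);
    if (!handle) {
        std::fprintf(stderr, "failed to load %s\n", so_name);
        return 1;
    }
    std::printf("loaded %s\n", so_name);
    return 0;
}
```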
@@ -3,36 +3,14 @@ | |||
#include <cstdio> | |||
#define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v) | |||
extern "C" { | |||
#include <cuda.h> | |||
#include "cuda.h" | |||
} | |||
#include <cudaProfiler.h> | |||
#include "cudaProfiler.h" | |||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||
#if defined(_WIN32) | |||
#include <windows.h> | |||
#define RTLD_LAZY 0 | |||
static void* dlopen(const char* file, int) { | |||
return static_cast<void*>(LoadLibraryA(file)); | |||
} | |||
static void* dlerror() { | |||
const char* errmsg = "dlerror not aviable in windows"; | |||
return const_cast<char*>(errmsg); | |||
} | |||
static void* dlsym(void* handle, const char* name) { | |||
FARPROC symbol = GetProcAddress((HMODULE)handle, name); | |||
return reinterpret_cast<void*>(symbol); | |||
} | |||
#else | |||
#include <dlfcn.h> | |||
#include <unistd.h> | |||
#endif | |||
static void log_failed_load(int func_idx); | |||
namespace { | |||
template <typename T> | |||
@@ -42,68 +20,63 @@ CUresult on_init_failed(int func_idx) { | |||
log_failed_load(func_idx); | |||
return CUDA_ERROR_UNKNOWN; | |||
} | |||
} | |||
} // namespace | |||
#define _WRAPLIB_API_CALL CUDAAPI | |||
#define _WRAPLIB_CALLBACK CUDA_CB | |||
#include "./libcuda-wrap.h" | |||
#if CUDA_VERSION == 10010 | |||
#include "./libcuda-wrap_10.1.h" | |||
#elif CUDA_VERSION == 10020 | |||
#include "./libcuda-wrap_10.2.h" | |||
#elif CUDA_VERSION == 11010 | |||
#include "./libcuda-wrap_11.1.h" | |||
#elif CUDA_VERSION == 11020 | |||
#include "./libcuda-wrap_11.2.h" | |||
#else | |||
#error "cuda stub not support this cuda version, you can close cuda stub to passby" | |||
#endif | |||
#undef _WRAPLIB_CALLBACK | |||
#undef _WRAPLIB_API_CALL | |||
static const char* default_so_name = | |||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||
"nvcuda.dll"; | |||
#elif defined(__APPLE__) || defined(__MACOSX) | |||
"libcuda.dylib"; | |||
#else | |||
"libcuda.so.1"; | |||
#endif | |||
// Harvested from cuda_drvapi_dynlink.c | |||
static const char* default_so_paths[] = { | |||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||
"nvcuda.dll", | |||
#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX) | |||
"nvcuda.dll", | |||
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \ | |||
defined(__MACOSX) | |||
#if defined(__APPLE__) || defined(__MACOSX) | |||
"/usr/local/cuda/lib/libcuda.dylib", | |||
"/usr/local/cuda/lib/libcuda.dylib", | |||
#elif defined(__ANDROID__) | |||
#if defined (__aarch64__) | |||
"/system/vendor/lib64/libcuda.so", | |||
#if defined(__aarch64__) | |||
"/system/vendor/lib64/libcuda.so", | |||
#elif defined(__arm__) | |||
"/system/vendor/lib/libcuda.so", | |||
"/system/vendor/lib/libcuda.so", | |||
#endif | |||
#else | |||
"libcuda.so.1", | |||
// In case some users do not have the correct search path configured in
// /etc/ld.so.conf | |||
"/usr/lib/x86_64-linux-gnu/libcuda.so", | |||
"/usr/local/nvidia/lib64/libcuda.so", | |||
"libcuda.so.1", | |||
#endif | |||
#else | |||
#error "Unknown platform" | |||
#endif | |||
}; | |||
static void* get_library_handle() { | |||
void* handle = nullptr; | |||
for (size_t i = 0; i < (sizeof(default_so_paths) / sizeof(char*)); i++) { | |||
handle = dlopen(default_so_paths[i], RTLD_LAZY); | |||
if (handle) { | |||
break; | |||
} | |||
} | |||
if (!handle) { | |||
LOGE("Failed to load CUDA Driver API library"); | |||
return nullptr; | |||
} | |||
return handle; | |||
} | |||
static void log_failed_load(int func_idx) { | |||
LOGE("failed to load cuda func: %s", g_func_name[func_idx]); | |||
} | |||
static const char* extra_so_paths[] = { | |||
"/usr/lib/x86_64-linux-gnu/libcuda.so", | |||
"/usr/local/nvidia/lib64/libcuda.so", | |||
}; | |||
static void* resolve_library_func(void* handle, const char* func) { | |||
if (!handle) { | |||
LOGE("handle should not be nullptr!"); | |||
return nullptr; | |||
} | |||
auto ret = dlsym(handle, func); | |||
if (!ret) { | |||
LOGE("failed to load cuda func: %s", func); | |||
} | |||
return ret; | |||
} | |||
static const char* g_default_api_name = "cuda"; | |||
#include "./dlopen_helper.h" |
@@ -0,0 +1,367 @@ | |||
// generated by wraplib.py | |||
// --- begin functions to be implemented | |||
#ifndef _WRAPLIB_API_CALL | |||
#define _WRAPLIB_API_CALL | |||
#endif | |||
#ifndef _WRAPLIB_CALLBACK | |||
#define _WRAPLIB_CALLBACK | |||
#endif | |||
#ifndef ON_ENTRY | |||
#define ON_ENTRY(x) | |||
#endif | |||
static void* get_library_handle(); | |||
static void* resolve_library_func(void*, const char*); | |||
namespace { | |||
template <typename T> | |||
T on_init_failed(int func_idx); | |||
} | |||
// --- end functions to be implemented | |||
#include <cstddef> | |||
#include <mutex> | |||
extern "C" { | |||
const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0, | |||
const char* arg1, | |||
const char* arg2, int arg3, | |||
const char* const* arg4, | |||
const char* const* arg5); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1, | |||
const char* const* arg2); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0, | |||
size_t* arg1); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0, | |||
size_t* arg1); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0, char* arg1); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0, | |||
const char* const arg1); | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0, | |||
const char* const arg1, | |||
const char** arg2); | |||
} | |||
static void load_library(); | |||
static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_init( | |||
nvrtcResult arg0) { | |||
load_library(); | |||
return nvrtcGetErrorString(arg0); | |||
} | |||
static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_error(nvrtcResult) { | |||
return on_init_failed<const char*>(0); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_init(int* arg0, int* arg1) { | |||
load_library(); | |||
return nvrtcVersion(arg0, arg1); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_error(int*, int*) { | |||
return on_init_failed<nvrtcResult>(1); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_init(int* arg0) { | |||
load_library(); | |||
return nvrtcGetNumSupportedArchs(arg0); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_error(int*) { | |||
return on_init_failed<nvrtcResult>(2); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_init(int* arg0) { | |||
load_library(); | |||
return nvrtcGetSupportedArchs(arg0); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_error(int*) { | |||
return on_init_failed<nvrtcResult>(3); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram_init( | |||
nvrtcProgram* arg0, const char* arg1, const char* arg2, int arg3, | |||
const char* const* arg4, const char* const* arg5) { | |||
load_library(); | |||
return nvrtcCreateProgram(arg0, arg1, arg2, arg3, arg4, arg5); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL | |||
nvrtcCreateProgram_error(nvrtcProgram*, const char*, const char*, int, | |||
const char* const*, const char* const*) { | |||
return on_init_failed<nvrtcResult>(4); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL | |||
nvrtcDestroyProgram_init(nvrtcProgram* arg0) { | |||
load_library(); | |||
return nvrtcDestroyProgram(arg0); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram_error(nvrtcProgram*) { | |||
return on_init_failed<nvrtcResult>(5); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL | |||
nvrtcCompileProgram_init(nvrtcProgram arg0, int arg1, const char* const* arg2) { | |||
load_library(); | |||
return nvrtcCompileProgram(arg0, arg1, arg2); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL | |||
nvrtcCompileProgram_error(nvrtcProgram, int, const char* const*) { | |||
return on_init_failed<nvrtcResult>(6); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_init(nvrtcProgram arg0, | |||
size_t* arg1) { | |||
load_library(); | |||
return nvrtcGetPTXSize(arg0, arg1); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_error(nvrtcProgram, | |||
size_t*) { | |||
return on_init_failed<nvrtcResult>(7); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_init(nvrtcProgram arg0, | |||
char* arg1) { | |||
load_library(); | |||
return nvrtcGetPTX(arg0, arg1); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_error(nvrtcProgram, char*) { | |||
return on_init_failed<nvrtcResult>(8); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_init(nvrtcProgram arg0, | |||
size_t* arg1) { | |||
load_library(); | |||
return nvrtcGetCUBINSize(arg0, arg1); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_error(nvrtcProgram, | |||
size_t*) { | |||
return on_init_failed<nvrtcResult>(9); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_init(nvrtcProgram arg0, | |||
char* arg1) { | |||
load_library(); | |||
return nvrtcGetCUBIN(arg0, arg1); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_error(nvrtcProgram, char*) { | |||
return on_init_failed<nvrtcResult>(10); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL | |||
nvrtcGetProgramLogSize_init(nvrtcProgram arg0, size_t* arg1) { | |||
load_library(); | |||
return nvrtcGetProgramLogSize(arg0, arg1); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize_error(nvrtcProgram, | |||
size_t*) { | |||
return on_init_failed<nvrtcResult>(11); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_init(nvrtcProgram arg0, | |||
char* arg1) { | |||
load_library(); | |||
return nvrtcGetProgramLog(arg0, arg1); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_error(nvrtcProgram, | |||
char*) { | |||
return on_init_failed<nvrtcResult>(12); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL | |||
nvrtcAddNameExpression_init(nvrtcProgram arg0, const char* const arg1) { | |||
load_library(); | |||
return nvrtcAddNameExpression(arg0, arg1); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL | |||
nvrtcAddNameExpression_error(nvrtcProgram, const char* const) { | |||
return on_init_failed<nvrtcResult>(13); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName_init( | |||
nvrtcProgram arg0, const char* const arg1, const char** arg2) { | |||
load_library(); | |||
return nvrtcGetLoweredName(arg0, arg1, arg2); | |||
} | |||
static nvrtcResult _WRAPLIB_API_CALL | |||
nvrtcGetLoweredName_error(nvrtcProgram, const char* const, const char**) { | |||
return on_init_failed<nvrtcResult>(14); | |||
} | |||
static constexpr size_t NR_FUNC = 15; | |||
static void* g_func_table[NR_FUNC] = {(void*)(&nvrtcGetErrorString_init), | |||
(void*)(&nvrtcVersion_init), | |||
(void*)(&nvrtcGetNumSupportedArchs_init), | |||
(void*)(&nvrtcGetSupportedArchs_init), | |||
(void*)(&nvrtcCreateProgram_init), | |||
(void*)(&nvrtcDestroyProgram_init), | |||
(void*)(&nvrtcCompileProgram_init), | |||
(void*)(&nvrtcGetPTXSize_init), | |||
(void*)(&nvrtcGetPTX_init), | |||
(void*)(&nvrtcGetCUBINSize_init), | |||
(void*)(&nvrtcGetCUBIN_init), | |||
(void*)(&nvrtcGetProgramLogSize_init), | |||
(void*)(&nvrtcGetProgramLog_init), | |||
(void*)(&nvrtcAddNameExpression_init), | |||
(void*)(&nvrtcGetLoweredName_init)}; | |||
static void* g_func_table_error[NR_FUNC] = { | |||
(void*)(&nvrtcGetErrorString_error), | |||
(void*)(&nvrtcVersion_error), | |||
(void*)(&nvrtcGetNumSupportedArchs_error), | |||
(void*)(&nvrtcGetSupportedArchs_error), | |||
(void*)(&nvrtcCreateProgram_error), | |||
(void*)(&nvrtcDestroyProgram_error), | |||
(void*)(&nvrtcCompileProgram_error), | |||
(void*)(&nvrtcGetPTXSize_error), | |||
(void*)(&nvrtcGetPTX_error), | |||
(void*)(&nvrtcGetCUBINSize_error), | |||
(void*)(&nvrtcGetCUBIN_error), | |||
(void*)(&nvrtcGetProgramLogSize_error), | |||
(void*)(&nvrtcGetProgramLog_error), | |||
(void*)(&nvrtcAddNameExpression_error), | |||
(void*)(&nvrtcGetLoweredName_error)}; | |||
static const char* const g_func_name[NR_FUNC] = {"nvrtcGetErrorString", | |||
"nvrtcVersion", | |||
"nvrtcGetNumSupportedArchs", | |||
"nvrtcGetSupportedArchs", | |||
"nvrtcCreateProgram", | |||
"nvrtcDestroyProgram", | |||
"nvrtcCompileProgram", | |||
"nvrtcGetPTXSize", | |||
"nvrtcGetPTX", | |||
"nvrtcGetCUBINSize", | |||
"nvrtcGetCUBIN", | |||
"nvrtcGetProgramLogSize", | |||
"nvrtcGetProgramLog", | |||
"nvrtcAddNameExpression", | |||
"nvrtcGetLoweredName"}; | |||
static void load_library() { | |||
static bool done = false; | |||
static std::mutex mtx; | |||
std::lock_guard<std::mutex> lg{mtx}; | |||
if (done) | |||
return; | |||
void* handle = get_library_handle(); | |||
for (size_t i = 0; i < NR_FUNC; ++i) { | |||
void* func; | |||
if (!handle) { | |||
func = nullptr; | |||
} else { | |||
func = resolve_library_func(handle, g_func_name[i]); | |||
} | |||
if (!func) { | |||
func = g_func_table_error[i]; | |||
} | |||
__atomic_store_n(g_func_table + i, func, __ATOMIC_RELAXED); | |||
} | |||
done = true; | |||
} | |||
const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0) { | |||
typedef const char*(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcResult); | |||
ON_ENTRY(nvrtcGetErrorString); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[0]); | |||
return f(arg0); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*, int*); | |||
ON_ENTRY(nvrtcVersion); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[1]); | |||
return f(arg0, arg1); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*); | |||
ON_ENTRY(nvrtcGetNumSupportedArchs); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[2]); | |||
return f(arg0); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*); | |||
ON_ENTRY(nvrtcGetSupportedArchs); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[3]); | |||
return f(arg0); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0, | |||
const char* arg1, | |||
const char* arg2, int arg3, | |||
const char* const* arg4, | |||
const char* const* arg5) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)( | |||
nvrtcProgram*, const char*, const char*, int, const char* const*, | |||
const char* const*); | |||
ON_ENTRY(nvrtcCreateProgram); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[4]); | |||
return f(arg0, arg1, arg2, arg3, arg4, arg5); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram*); | |||
ON_ENTRY(nvrtcDestroyProgram); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[5]); | |||
return f(arg0); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1, | |||
const char* const* arg2) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, int, | |||
const char* const*); | |||
ON_ENTRY(nvrtcCompileProgram); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[6]); | |||
return f(arg0, arg1, arg2); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*); | |||
ON_ENTRY(nvrtcGetPTXSize); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[7]); | |||
return f(arg0, arg1); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*); | |||
ON_ENTRY(nvrtcGetPTX); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[8]); | |||
return f(arg0, arg1); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0, | |||
size_t* arg1) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*); | |||
ON_ENTRY(nvrtcGetCUBINSize); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[9]); | |||
return f(arg0, arg1); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*); | |||
ON_ENTRY(nvrtcGetCUBIN); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[10]); | |||
return f(arg0, arg1); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0, | |||
size_t* arg1) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*); | |||
ON_ENTRY(nvrtcGetProgramLogSize); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[11]); | |||
return f(arg0, arg1); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0, | |||
char* arg1) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*); | |||
ON_ENTRY(nvrtcGetProgramLog); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[12]); | |||
return f(arg0, arg1); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0, | |||
const char* const arg1) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, | |||
const char* const); | |||
ON_ENTRY(nvrtcAddNameExpression); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[13]); | |||
return f(arg0, arg1); | |||
} | |||
nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0, | |||
const char* const arg1, | |||
const char** arg2) { | |||
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)( | |||
nvrtcProgram, const char* const, const char**); | |||
ON_ENTRY(nvrtcGetLoweredName); | |||
f_ptr_t f = (f_ptr_t)(g_func_table[14]); | |||
return f(arg0, arg1, arg2); | |||
} |
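The generated wrapper above boils down to a self-replacing function-pointer table: every slot starts at an `*_init` thunk that loads the library, resolves the real symbol (or routes to the `*_error` handler), and dispatches again. The following stand-alone sketch reproduces that mechanism for a single function, using `libm`/`cos` as a stand-in for `libnvrtc`; names such as `my_cos` are illustrative only:

```cpp
#include <dlfcn.h>
#include <cmath>
#include <cstdio>
#include <mutex>

using cos_fn = double (*)(double);

static double cos_error(double) { return std::nan(""); }  // used when loading fails
static double cos_init(double x);                         // first-call thunk, below

// one-slot function table; the real wrappers have one slot per API entry point
static void* g_func_table[1] = {(void*)&cos_init};

static void load_library() {
    static bool done = false;
    static std::mutex mtx;
    std::lock_guard<std::mutex> lg{mtx};
    if (done) return;
    void* handle = dlopen("libm.so.6", RTLD_LAZY);      // stand-in for libnvrtc.so
    void* func = handle ? dlsym(handle, "cos") : nullptr;
    g_func_table[0] = func ? func : (void*)&cos_error;  // fall back to the error stub
    done = true;
}

static double cos_init(double x) {   // resolves the real symbol, then dispatches again
    load_library();
    return ((cos_fn)g_func_table[0])(x);
}

double my_cos(double x) {            // exported wrapper, analogous to nvrtcVersion()
    return ((cos_fn)g_func_table[0])(x);
}

int main() {
    std::printf("my_cos(0) = %f\n", my_cos(0.0));  // first call triggers the lazy load
}
```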
@@ -0,0 +1,75 @@ | |||
/** | |||
* \file dnn/cuda-stub/src/libnvrtc.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma GCC visibility push(default) | |||
#include <cstdio> | |||
#define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v) | |||
#include "./nvrtc_type.h" | |||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||
static void log_failed_load(int func_idx); | |||
namespace { | |||
template <typename T> | |||
T on_init_failed(int func_idx); | |||
template <> | |||
nvrtcResult on_init_failed(int func_idx) { | |||
log_failed_load(func_idx); | |||
return NVRTC_ERROR_INTERNAL_ERROR; | |||
} | |||
template <> | |||
const char* on_init_failed(int func_idx) { | |||
log_failed_load(func_idx); | |||
return "load lib failed"; | |||
} | |||
} // namespace | |||
#include "./libnvrtc-wrap.h" | |||
static const char* default_so_name = | |||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||
"nvrtc.dll"; | |||
#elif defined(__APPLE__) || defined(__MACOSX) | |||
"libnvrtc.dylib"; | |||
#else | |||
"libnvrtc.so"; | |||
#endif | |||
static const char* default_so_paths[] = { | |||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) | |||
"nvrtc.dll", | |||
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \ | |||
defined(__MACOSX) | |||
#if defined(__APPLE__) || defined(__MACOSX) | |||
"/usr/local/cuda/lib/libnvrtc.dylib", | |||
#elif defined(__ANDROID__) | |||
#if defined(__aarch64__) | |||
"/system/vendor/lib64/libnvrtc.so", | |||
#elif defined(__arm__) | |||
"/system/vendor/lib/libnvrtc.so", | |||
#endif | |||
#else | |||
"libnvrtc.so", | |||
// In case some users do not have the correct search path configured in
// /etc/ld.so.conf | |||
"/usr/lib/x86_64-linux-gnu/libnvrtc.so", | |||
"/usr/local/nvidia/lib64/libnvrtc.so", | |||
"/usr/local/cuda/lib64/libnvrtc.so", | |||
#endif | |||
#else | |||
#error "Unknown platform" | |||
#endif | |||
}; | |||
static const char* extra_so_paths[] = {}; | |||
static const char* g_default_api_name = "nvrtc"; | |||
#include "./dlopen_helper.h" |
@@ -0,0 +1,27 @@ | |||
#pragma once | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif /* __cplusplus */ | |||
#include <stdlib.h> | |||
typedef enum { | |||
NVRTC_SUCCESS = 0, | |||
NVRTC_ERROR_OUT_OF_MEMORY = 1, | |||
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, | |||
NVRTC_ERROR_INVALID_INPUT = 3, | |||
NVRTC_ERROR_INVALID_PROGRAM = 4, | |||
NVRTC_ERROR_INVALID_OPTION = 5, | |||
NVRTC_ERROR_COMPILATION = 6, | |||
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, | |||
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, | |||
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, | |||
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, | |||
NVRTC_ERROR_INTERNAL_ERROR = 11 | |||
} nvrtcResult; | |||
typedef struct _nvrtcProgram *nvrtcProgram; | |||
#ifdef __cplusplus | |||
} | |||
#endif /* __cplusplus */ |
@@ -26,6 +26,7 @@ static inline CUBLASLTMatmulDesc::SizeArgs from_local_size_args( | |||
return {handle, transA, transB, | |||
args.layout_a, args.layout_b, args.layout_c}; | |||
} | |||
bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available( | |||
const SizeArgs& args) const { | |||
auto cublasLt_args = from_local_size_args(args); | |||
@@ -35,6 +36,7 @@ bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available( | |||
.is_available(cublasLt_args, INT_MAX); | |||
return res; | |||
} | |||
size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes( | |||
const SizeArgs& args) const { | |||
auto cublasLt_args = from_local_size_args(args); | |||
@@ -43,6 +45,7 @@ size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes( | |||
desc.get_algorithm_heuristic(cublasLt_args, INT_MAX, algo); | |||
return desc.get_workspace_bundle(cublasLt_args, algo).total_size_in_bytes(); | |||
} | |||
void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||
const ExecArgs& args) const { | |||
auto cublasLt_args = from_local_size_args(args); | |||
@@ -89,6 +92,7 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||
desc.layout_c, &algo, ws_bundle.get(0), | |||
ws_bundle.get_size(0), stream)); | |||
}; | |||
auto batched_igemm = [&]() { | |||
auto zero = handle->zero_device(); | |||
auto one = handle->one_device(); | |||
@@ -133,6 +137,18 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||
}; | |||
ws_bundle.set(args.workspace.raw_ptr); | |||
#if CUDA_VERSION >= 11000 | |||
if (desc.dt_compute == CUBLAS_COMPUTE_32I) { | |||
batched_igemm(); | |||
} else if (desc.dt_compute == CUBLAS_COMPUTE_16F) { | |||
batched_hgemm(); | |||
} else if (desc.dt_compute == CUBLAS_COMPUTE_32F) { | |||
batched_sgemm(); | |||
} else { | |||
megdnn_throw( | |||
megdnn_mangle("compute_type must be int32/float16/float32")); | |||
} | |||
#else | |||
if (desc.dt_compute == CUDA_R_32I) { | |||
batched_igemm(); | |||
} else if (desc.dt_compute == CUDA_R_16F) { | |||
@@ -143,5 +159,6 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec( | |||
megdnn_throw( | |||
megdnn_mangle("compute_type must be int32/float16/float32")); | |||
} | |||
#endif | |||
} | |||
#endif |
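The `#if CUDA_VERSION >= 11000` split above exists because cuBLASLt changed its compute-type parameter from `cudaDataType_t` to `cublasComputeType_t` in CUDA 11. The hunks further down take the other route and simply alias the new name on older toolkits (the `CUBLAS_COMPUTE_32I` define). A compile-time sketch of that shim; the `mm_compute_t` alias is illustrative, not a name from the tree:

```cpp
#include <cuda.h>            // CUDA_VERSION
#include <library_types.h>   // cudaDataType_t
#if CUDA_VERSION >= 11000
#include <cublasLt.h>        // cublasComputeType_t exists only from CUDA 11 on
using mm_compute_t = cublasComputeType_t;
#else
using mm_compute_t = cudaDataType_t;
// map the new compute-type names back to the old data-type values
#define CUBLAS_COMPUTE_32I CUDA_R_32I
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#endif
```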
@@ -156,6 +156,9 @@ std::string ConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const { | |||
case param::ConvBias::NonlineMode::IDENTITY: | |||
nonlinear_mode_str = "IDENTITY"; | |||
break; | |||
case param::ConvBias::NonlineMode::H_SWISH: | |||
nonlinear_mode_str = "H_SWISH"; | |||
break; | |||
default: | |||
megdnn_throw("invalid conv bias nonlinear mode"); | |||
} | |||
@@ -165,6 +165,23 @@ void TensorDesc::set(const TensorLayout& layout, | |||
} | |||
} | |||
std::string TensorDesc::to_string() { | |||
cudnnDataType_t data_type; | |||
int n; | |||
int c; | |||
int h; | |||
int w; | |||
int n_stride; | |||
int c_stride; | |||
int h_stride; | |||
int w_stride; | |||
cudnn_check(cudnnGetTensor4dDescriptor(desc, &data_type, &n, &c, &h, &w, | |||
&n_stride, &c_stride, &h_stride, | |||
&w_stride)); | |||
return ssprintf("<dtype_%d, %d,%d,%d,%d(%d,%d,%d,%d)>", data_type, n, c, h, | |||
w, n_stride, c_stride, h_stride, w_stride); | |||
} | |||
template <typename Param> | |||
FilterDesc<Param>::FilterDesc() { | |||
cudnn_check(cudnnCreateFilterDescriptor(&desc)); | |||
@@ -176,6 +193,20 @@ FilterDesc<Param>::~FilterDesc() { | |||
} | |||
template <typename Param> | |||
std::string FilterDesc<Param>::to_string() { | |||
cudnnDataType_t data_type; | |||
cudnnTensorFormat_t format; | |||
int k; | |||
int c; | |||
int h; | |||
int w; | |||
cudnn_check(cudnnGetFilter4dDescriptor(desc, &data_type, &format, &k, &c, | |||
&h, &w)); | |||
return ssprintf("<dtype_%d, format_%d, %d,%d,%d,%d>", data_type,format, k, c, h, | |||
w); | |||
} | |||
template <typename Param> | |||
void FilterDesc<Param>::set( | |||
const typename ConvolutionBase<Param>::CanonizedFilterMeta& | |||
filter_meta) { | |||
@@ -30,6 +30,7 @@ class TensorDesc { | |||
//! default layout is nchw | |||
void set(const TensorLayout& layout, const param::Convolution::Format = | |||
param::Convolution::Format::NCHW); | |||
std::string to_string(); | |||
~TensorDesc(); | |||
cudnnTensorDescriptor_t desc; | |||
}; | |||
@@ -39,6 +40,7 @@ class FilterDesc { | |||
public: | |||
FilterDesc(); | |||
void set(const typename ConvolutionBase<Param>::CanonizedFilterMeta &meta); | |||
std::string to_string(); | |||
~FilterDesc(); | |||
cudnnFilterDescriptor_t desc; | |||
}; | |||
@@ -25,6 +25,10 @@ using namespace cuda; | |||
#define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF | |||
#endif | |||
#if CUDA_VERSION < 11000 | |||
#define CUBLAS_COMPUTE_32I CUDA_R_32I | |||
#endif | |||
bool MatrixMulForwardImpl::AlgoCuBlas::is_available( | |||
const SizeArgs& args) const { | |||
if (args.opr->param().format != param::MatrixMul::Format::DEFAULT) | |||
@@ -117,7 +121,7 @@ void MatrixMulForwardImpl::AlgoCuBlas::exec(const ExecArgs& args) const { | |||
args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr, | |||
CUDA_R_8I, args.tensor_a.layout.stride[0], zero, | |||
args.tensor_c.raw_ptr, CUDA_R_32I, | |||
args.tensor_c.layout.stride[0], CUDA_R_32I, CUBLAS_GEMM_DFALT)); | |||
args.tensor_c.layout.stride[0], CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DFALT)); | |||
}; | |||
// Note that cublas takes column-major matrices as inputs, | |||
@@ -6,10 +6,11 @@ | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "src/cuda/matrix_mul/cublasLt_wrapper.h" | |||
#include "src/common/utils.h" | |||
#include "src/cuda/matrix_mul/cublasLt_wrapper.h" | |||
#include "src/cuda/utils.h" | |||
#if CUDA_VERSION >= 10010 | |||
@@ -33,6 +34,7 @@ static cudaDataType_t to_cuda_dtype(DType tp) { | |||
} | |||
} | |||
#if CUDA_VERSION >= 11000 | |||
static cublasComputeType_t to_cublas_compute_type(DType tp) { | |||
switch (tp.enumv()) { | |||
case DTypeEnum::Float16: | |||
@@ -43,10 +45,11 @@ static cublasComputeType_t to_cublas_compute_type(DType tp) { | |||
case DTypeEnum::QuantizedS32: | |||
return CUBLAS_COMPUTE_32I; | |||
default: | |||
megdnn_throw(megdnn_mangle( | |||
"dtype must be float16/float32/int32/Qs32")); | |||
megdnn_throw( | |||
megdnn_mangle("dtype must be float16/float32/int32/Qs32")); | |||
} | |||
} | |||
#endif | |||
static const char* cuda_type_to_str(cudaDataType_t tp) { | |||
switch (tp) { | |||
@@ -106,9 +109,15 @@ void CUBLASLTMatmulDesc::set(const SizeArgs& args, bool batched) { | |||
dt_b = to_cuda_dtype(args.layout_b.dtype); | |||
dt_a = to_cuda_dtype(args.layout_a.dtype); | |||
dt_c = to_cuda_dtype(args.layout_c.dtype); | |||
dt_compute = to_cublas_compute_type(args.layout_c.dtype); | |||
megdnn_assert(dt_a == dt_b, "matrix A and B should have same precision"); | |||
#if CUDA_VERSION >= 11000 | |||
dt_compute = to_cublas_compute_type(args.layout_c.dtype); | |||
cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute, dt_c)); | |||
#else | |||
dt_compute = dt_c; | |||
cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute)); | |||
#endif | |||
cublas_check(cublasLtMatmulDescSetAttribute( | |||
matmul_desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pm, sizeof(pm))); | |||
@@ -262,8 +271,7 @@ WorkspaceBundle CUBLASLTMatmulDesc::get_workspace_bundle( | |||
dt_c == CUDA_R_32I ? layout_trans_b : layout_b, | |||
dt_c == CUDA_R_32I ? layout_trans_a : layout_a, | |||
dt_c == CUDA_R_32I ? layout_trans_c : layout_c, | |||
dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo, | |||
&result); | |||
dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo, &result); | |||
// return empty WorkspaceBundle if cublasLtMatmulAlgoCheck() failed | |||
if (status != CUBLAS_STATUS_SUCCESS) | |||
return {nullptr, {}}; | |||
@@ -48,7 +48,11 @@ struct CUBLASLTMatmulDesc { | |||
bool is_batched; | |||
cublasLtMatmulDesc_t matmul_desc; | |||
cudaDataType_t dt_a, dt_b, dt_c; | |||
#if CUDA_VERSION >= 11000 | |||
cublasComputeType_t dt_compute; | |||
#else | |||
cudaDataType_t dt_compute; | |||
#endif | |||
cublasLtMatrixLayout_t layout_a, layout_b, layout_c; | |||
cublasLtMatrixLayout_t layout_trans_a, layout_trans_b, layout_trans_c; | |||
size_t workspace_a, workspace_b, workspace_c; | |||
@@ -128,7 +128,23 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { | |||
stream)); | |||
cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc)); | |||
}; | |||
switch(desc.dt_compute) { | |||
#if CUDA_VERSION >= 11000 | |||
switch (desc.dt_compute) { | |||
case CUBLAS_COMPUTE_16F: | |||
hgemm(); | |||
break; | |||
case CUBLAS_COMPUTE_32F: | |||
sgemm(); | |||
break; | |||
case CUBLAS_COMPUTE_32I: | |||
igemm(); | |||
break; | |||
default: | |||
megdnn_throw(megdnn_mangle( | |||
"compute type must be float16/float32/int32")); | |||
} | |||
#else | |||
switch (desc.dt_compute) { | |||
case CUDA_R_16F: | |||
hgemm(); | |||
break; | |||
@@ -139,8 +155,10 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const { | |||
igemm(); | |||
break; | |||
default: | |||
megdnn_throw(megdnn_mangle("compute type must be float16/float32/int32")); | |||
megdnn_throw(megdnn_mangle( | |||
"compute type must be float16/float32/int32")); | |||
} | |||
#endif | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen |
@@ -309,6 +309,9 @@ void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args, | |||
arg.f / (1e12); | |||
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, | |||
filter{arg.co, arg.ci, arg.f, arg.f}; | |||
if (!algo){ | |||
algo = "no_name"; | |||
} | |||
printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, " | |||
"time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f " | |||
"%.2fTops, " | |||
@@ -1,2 +1,3 @@ | |||
Makefile | |||
/test/imperative_test | |||
python/megengine/version.py |
@@ -70,3 +70,8 @@ add_custom_command( | |||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt | |||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt | |||
) | |||
add_custom_command( | |||
TARGET ${MODULE_NAME} POST_BUILD | |||
COMMAND "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/python/gen_version.py --output ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/version.py | |||
) |
@@ -0,0 +1,31 @@ | |||
import argparse | |||
import os | |||
import subprocess | |||
def get_git_commit(src_dir): | |||
try: | |||
return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=src_dir).decode('ascii').strip() | |||
except Exception: | |||
return 'unknown' | |||
def get_mge_version(version_txt_path): | |||
v = {} | |||
with open(version_txt_path) as fp: | |||
exec(fp.read(), v) | |||
return v | |||
if __name__ == "__main__": | |||
parser = argparse.ArgumentParser(description="generate version.py to build path") | |||
parser.add_argument("--output", type=str, required=True) | |||
args = parser.parse_args() | |||
python_dir = os.path.dirname(__file__) | |||
version_txt_path = os.path.join(python_dir, 'version_template.py') | |||
commit_id = get_git_commit(python_dir) | |||
mge_ver_map = get_mge_version(version_txt_path) | |||
mge_ver = mge_ver_map['__version__'] if '__version__' in mge_ver_map else 'unknown' | |||
mge_intl = mge_ver_map['__internal__'] if '__internal__' in mge_ver_map else False | |||
with open(args.output, 'w') as f: | |||
f.write("__version__ = '{}'\n".format(mge_ver)) | |||
f.write("git_version = {}\n".format(repr(commit_id))) | |||
if mge_intl: | |||
f.write("__internal__ = True\n") |
@@ -0,0 +1,10 @@ | |||
# -*- coding: utf-8 -*- | |||
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
# | |||
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
# | |||
# Unless required by applicable law or agreed to in writing, | |||
# software distributed under the License is distributed on an | |||
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
__version__ = "1.3.0.dev" | |||
@@ -8,7 +8,7 @@ | |||
```bash | |||
1: please refer to: https://docs.docker.com/engine/security/rootless/ to enable rootless docker env | |||
2: cd ./scripts/whl/manylinux2010 | |||
2: cd ./scripts/whl/manylinux2014 | |||
3: ./build_image.sh | |||
``` | |||
@@ -56,24 +56,25 @@ | |||
``` | |||
# How to build | |||
Note: make sure the git repo is mounted in the docker container; do not use `git submodule update --init` in it to init the MegBrain repo
## Build for linux | |||
* MegBrain delivers `wheel` package with `manylinux2010` tag defined in [PEP-571](https://www.python.org/dev/peps/pep-0571/). | |||
* MegBrain delivers `wheel` package with `manylinux2014` tag defined in [PEP-599](https://www.python.org/dev/peps/pep-0599/).
commands: | |||
```bash | |||
export CUDA_ROOT_DIR=/path/to/cuda | |||
export CUDNN_ROOT_DIR=/path/to/cudnn | |||
export TENSORRT_ROOT_DIR=/path/to/tensorrt | |||
./scripts/whl/manylinux2010/build_wheel.sh | |||
./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||
``` | |||
* And you can find all of the outputs in the `output` directory. If you just want to build for a specific Python version, you can use the `ALL_PYTHON` environment variable, e.g.:
```bash | |||
ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh | |||
ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||
``` | |||
* If you just want to build the CPU-only version, you can set the `BUILD_WHL_CPU_ONLY` environment variable to 'ON', e.g.:
```bash | |||
BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh | |||
BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101 | |||
``` | |||
## Build for MacOS | |||
@@ -0,0 +1,15 @@ | |||
FROM quay.io/pypa/manylinux2014_x86_64:2020-12-31-56195b3 | |||
ENV UID=1024 \ | |||
PATH=${PATH}:/usr/local/cuda/bin \ | |||
LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ | |||
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \ | |||
CPATH=${CPATH}:/usr/local/cuda/include:/opt/cudnn/include:/opt/tensorrt/include | |||
ARG platform | |||
COPY setup_mirror.sh . | |||
RUN ./setup_mirror.sh "$platform" | |||
ADD init_image.sh /tmp | |||
RUN /tmp/init_image.sh && rm -f /tmp/init_image.sh | |||
@@ -0,0 +1,5 @@ | |||
#!/bin/bash -e | |||
cd $(dirname $0) | |||
docker build -t env_manylinux2014:latest . |
@@ -0,0 +1,217 @@ | |||
#!/bin/bash | |||
set -e | |||
CWD=$(dirname $0) | |||
BASEDIR=$(readlink -f ${CWD}/../../..) | |||
OUTPUTDIR=$(readlink -f ${CWD}/output) | |||
USERID=$(id -u) | |||
TMPFS_ARGS="--tmpfs /tmp:exec" | |||
local_path=$(dirname $(readlink -f $0)) | |||
CUDNN_LIB_DIR="/opt/cudnn/lib64/" | |||
CUDA_LIB_DIR="/usr/local/cuda/lib64/" | |||
CUDA_SDK="unknown" | |||
function usage() { | |||
echo "use '-sdk cu111' to specify cuda toolkit config, also support cu101, cu112" | |||
} | |||
while [ "$1" != "" ]; do | |||
case $1 in | |||
-sdk) | |||
shift | |||
CUDA_SDK=$1 | |||
shift | |||
;; | |||
*) | |||
usage | |||
exit 1 | |||
esac | |||
done | |||
echo "Build with ${CUDA_SDK}" | |||
if [ $CUDA_SDK == "cu101" ];then | |||
COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1" | |||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF" | |||
OUT_DIR="cu101" | |||
BUILD_GCC8="ON" | |||
REQUIR_CUDA_VERSION="10010" | |||
REQUIR_CUDNN_VERSION="7.6.3" | |||
REQUIR_TENSORRT_VERSION="6.0.1.5" | |||
elif [ $CUDA_SDK == "cu111" ];then | |||
COPY_LIB_LIST="\ | |||
${CUDA_LIB_DIR}/libnvrtc.so.11.1:\ | |||
${CUDA_LIB_DIR}/libcublasLt.so.11:\ | |||
${CUDA_LIB_DIR}/libcublas.so.11:\ | |||
${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn.so.8" | |||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\ | |||
-gencode arch=compute_61,code=sm_61 \ | |||
arch=compute_70,code=sm_70 \ | |||
arch=compute_75,code=sm_75 \ | |||
arch=compute_80,code=sm_80 \ | |||
arch=compute_86,code=sm_86 \ | |||
arch=compute_86,code=compute_86" | |||
OUT_DIR="cu111" | |||
REQUIR_CUDA_VERSION="11010" | |||
REQUIR_CUDNN_VERSION="8.0.5" | |||
REQUIR_TENSORRT_VERSION="7.2.2.3" | |||
elif [ $CUDA_SDK == "cu112" ];then | |||
COPY_LIB_LIST="\ | |||
${CUDA_LIB_DIR}/libnvrtc.so.11.2:\ | |||
${CUDA_LIB_DIR}/libcublasLt.so.11:\ | |||
${CUDA_LIB_DIR}/libcublas.so.11:\ | |||
${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\ | |||
${CUDNN_LIB_DIR}/libcudnn.so.8" | |||
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \ | |||
-gencode arch=compute_61,code=sm_61 \ | |||
arch=compute_70,code=sm_70 \ | |||
arch=compute_75,code=sm_75 \ | |||
arch=compute_80,code=sm_80 \ | |||
arch=compute_86,code=sm_86 \ | |||
arch=compute_86,code=compute_86" | |||
OUT_DIR="cu112" | |||
REQUIR_CUDA_VERSION="11020" | |||
REQUIR_CUDNN_VERSION="8.0.5" | |||
REQUIR_TENSORRT_VERSION="7.2.2.3" | |||
else | |||
echo "no support sdk ${CUDA_SDK}, please set by '-sdk cu111'" | |||
exit -1 | |||
fi | |||
BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} | |||
if [[ -z ${BUILD_WHL_CPU_ONLY} ]] | |||
then | |||
BUILD_WHL_CPU_ONLY="OFF" | |||
fi | |||
echo ${BASEDIR} | |||
pushd ${BASEDIR}/third_party >/dev/null | |||
./prepare.sh | |||
popd >/dev/null | |||
cd ${CWD} | |||
mkdir -p ${OUTPUTDIR} | |||
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||
if [[ -z ${CUDA_ROOT_DIR} ]]; then | |||
echo "Environment variable CUDA_ROOT_DIR not set." | |||
exit -1 | |||
fi | |||
if [[ -z ${CUDNN_ROOT_DIR} ]]; then | |||
echo "Environment variable CUDNN_ROOT_DIR not set." | |||
exit -1 | |||
fi | |||
if [[ -z ${TENSORRT_ROOT_DIR} ]]; then | |||
echo "Environment variable TENSORRT_ROOT_DIR not set." | |||
exit -1 | |||
fi | |||
## YOU SHOULD MODIFY THE CUDA VERSION BELOW WHEN UPGRADING
CUDA_ROOT_DIR_=${CUDA_ROOT_DIR%*/} | |||
CUDNN_ROOT_DIR_=${CUDNN_ROOT_DIR%*/} | |||
TENSORRT_ROOT_DIR_=${TENSORRT_ROOT_DIR%*/} | |||
CUDA_VERSION_PATH=${CUDA_ROOT_DIR_}/include/cuda.h | |||
if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then | |||
CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn_version.h | |||
else | |||
CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn.h | |||
fi | |||
TENSORRT_VERSION_PATH=${TENSORRT_ROOT_DIR_}/include/NvInferVersion.h | |||
if [ ! -e $CUDA_VERSION_PATH ] ; then | |||
echo file $CUDA_VERSION_PATH does not exist
echo please check that the environment uses CUDA-$REQUIR_CUDA_VERSION
exit -1 | |||
fi | |||
if [ ! -e $CUDNN_VERSION_PATH ] ; then | |||
echo file $CUDNN_VERSION_PATH does not exist
echo please check that the environment uses CUDNN-V$REQUIR_CUDNN_VERSION
exit -1 | |||
fi | |||
if [ ! -e $TENSORRT_VERSION_PATH ] ; then | |||
echo "file $TENSORRT_VERSION_PATH does not exist"
echo "please check that the environment provides TensorRT-$REQUIR_TENSORRT_VERSION"
exit -1 | |||
fi | |||
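# parse the installed CUDA/cuDNN/TensorRT versions out of their headers and require an
# exact match with the versions selected by -sdk above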
CUDA_VERSION_CONTEXT=$(head -300 ${CUDA_VERSION_PATH}) | |||
CUDNN_VERSION_CONTEXT=$(head -62 ${CUDNN_VERSION_PATH}) | |||
TENSORRT_VERSION_CONTEXT=$(tail -12 ${TENSORRT_VERSION_PATH}) | |||
if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then | |||
CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define CUDA_VERSION * +([0-9]+)") | |||
else | |||
CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define __CUDA_API_VERSION * +([0-9]+)") | |||
fi | |||
CUDA_VERSION=${CUDA_API_VERSION:0-5} | |||
echo CUDA_VERSION:$CUDA_VERSION | |||
CUDNN_VERSION_MAJOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MAJOR * +([0-9]+)") | |||
CUDNN_VERSION_MINOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MINOR * +([0-9]+)") | |||
CUDNN_VERSION_PATCH=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_PATCHLEVEL * +([0-9]+)") | |||
CUDNN_VERSION=${CUDNN_VERSION_MAJOR:0-1}.${CUDNN_VERSION_MINOR:0-1}.${CUDNN_VERSION_PATCH:0-1} | |||
echo CUDNN_VERSION:$CUDNN_VERSION | |||
TENSORRT_VERSION_MAJOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MAJOR * +([0-9]+)") | |||
TENSORRT_VERSION_MINOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MINOR * +([0-9]+)") | |||
TENSORRT_VERSION_PATCH=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_PATCH * +([0-9]+)") | |||
TENSORRT_VERSION_BUILD=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_BUILD * +([0-9]+)") | |||
TENSORRT_VERSION=${TENSORRT_VERSION_MAJOR:0-1}.${TENSORRT_VERSION_MINOR:0-1}.${TENSORRT_VERSION_PATCH:0-1}.${TENSORRT_VERSION_BUILD:0-1} | |||
echo TENSORRT_VERSION:$TENSORRT_VERSION | |||
if [ $CUDA_VERSION != $REQUIR_CUDA_VERSION ] ; then | |||
echo "please check that the environment provides CUDA-$REQUIR_CUDA_VERSION (found $CUDA_VERSION)"
exit -1 | |||
fi | |||
if [ $CUDNN_VERSION != $REQUIR_CUDNN_VERSION ] ; then | |||
echo "please check that the environment provides cuDNN-$REQUIR_CUDNN_VERSION (found $CUDNN_VERSION)"
exit -1 | |||
fi | |||
if [ $TENSORRT_VERSION != $REQUIR_TENSORRT_VERSION ] ; then | |||
echo "please check that the environment provides TensorRT-$REQUIR_TENSORRT_VERSION (found $TENSORRT_VERSION)"
exit -1 | |||
fi | |||
fi | |||
if [[ -z ${BUILD_GCC8} ]];then | |||
BUILD_GCC8=OFF | |||
fi | |||
if [ "$BUILD_GCC8" == "ON" ];then | |||
run_cmd="scl enable devtoolset-8 /home/code/scripts/whl/manylinux2014/do_build_common.sh" | |||
else | |||
run_cmd="/home/code/scripts/whl/manylinux2014/do_build_common.sh" | |||
fi | |||
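# launch the manylinux2014 container and run the build; the host CUDA/cuDNN/TensorRT trees
# and the source tree are mounted at the fixed paths the in-container script expects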
docker run --rm -it $TMPFS_ARGS \ | |||
-e UID=${USERID} \ | |||
-e LOCAL_VERSION=${LOCAL_VERSION} \ | |||
-e BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} \ | |||
-e ALL_PYTHON="${ALL_PYTHON}" \ | |||
-e EXTRA_CMAKE_FLAG="$EXTRA_CMAKE_FLAG" \ | |||
-e COPY_LIB_LIST="$COPY_LIB_LIST" \ | |||
-e OUT_DIR="$OUT_DIR" \ | |||
-v ${CUDA_ROOT_DIR}:/usr/local/cuda \ | |||
-v ${CUDNN_ROOT_DIR}:/opt/cudnn \ | |||
-v ${TENSORRT_ROOT_DIR}:/opt/tensorrt \ | |||
-v ${BASEDIR}:/home/code \ | |||
-v ${OUTPUTDIR}:/home/output:rw \ | |||
env_manylinux2014:latest /bin/bash -c "$run_cmd" | |||
@@ -0,0 +1,136 @@ | |||
#!/bin/bash -ex | |||
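# Runs inside the manylinux2014 container: builds one MegEngine wheel per requested Python
# ABI, bundles and strips the native libraries, and repairs the result into a manylinux2014 wheel.

# split debug info into $1.dbg, strip the binary, record a gnu-debuglink, then drop the
# .dbg sidecar so only the stripped library is shipped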
function handle_strip() { | |||
echo "now handle strip $1" | |||
objcopy --only-keep-debug $1 $1.dbg | |||
strip -s $1 | |||
objcopy --add-gnu-debuglink=$1.dbg $1 | |||
rm $1.dbg | |||
} | |||
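# copy one shared library into dst_dir and, if requested, append an extra RPATH entry
# (preserving any RPATH already present) via patchelf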
function full_copy_so(){ | |||
lib_path=$1 | |||
dst_dir=$2 | |||
append_rpath=$3 | |||
lib_name=$(basename $lib_path) | |||
cp $lib_path $dst_dir/$lib_name | |||
if [ "$append_rpath" != "" ];then | |||
ori_rpath=$(patchelf --print-rpath $dst_dir/$lib_name) | |||
if [ "$ori_rpath" != "" ];then | |||
patchelf --set-rpath "$ori_rpath:$append_rpath" $dst_dir/$lib_name | |||
else | |||
patchelf --set-rpath "$append_rpath" $dst_dir/$lib_name | |||
fi | |||
fi | |||
} | |||
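# make the wheel self-contained when auditwheel is skipped: bundle the common and CUDA
# dependencies under megengine/core/lib and point the RPATHs of _imperative_rt.so and
# libmegengine_export.so at $ORIGIN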
function patch_elf_depend_lib() { | |||
echo "handle common depend lib" | |||
LIBS_DIR=${BUILD_DIR}/staging/megengine/core/lib | |||
mkdir -p ${LIBS_DIR} | |||
cp /usr/lib64/libatomic.so.1 ${LIBS_DIR} | |||
patchelf --remove-rpath ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so | |||
patchelf --force-rpath --set-rpath '$ORIGIN/lib' ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so | |||
cp ${BUILD_DIR}/src/libmegengine_export.so ${LIBS_DIR} | |||
patchelf --remove-rpath ${LIBS_DIR}/libmegengine_export.so | |||
patchelf --force-rpath --set-rpath '$ORIGIN/.' ${LIBS_DIR}/libmegengine_export.so | |||
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||
echo "handle cuda lib" | |||
cp ${BUILD_DIR}/dnn/cuda-stub/libcuda_stub.so ${LIBS_DIR} | |||
cp /usr/local/cuda/lib64/libnvToolsExt.so.1 ${LIBS_DIR} | |||
IFS=: read -a lib_name_array <<<"$COPY_LIB_LIST" | |||
append_rpath='$ORIGIN/.' | |||
for lib_name in ${lib_name_array[@]};do | |||
full_copy_so $lib_name ${LIBS_DIR} $append_rpath
done | |||
fi | |||
} | |||
ALL_PYTHON=${ALL_PYTHON} | |||
if [[ -z ${ALL_PYTHON} ]] | |||
then | |||
ALL_PYTHON="35m 36m 37m 38" | |||
fi | |||
BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} | |||
if [[ -z ${BUILD_WHL_CPU_ONLY} ]] | |||
then | |||
BUILD_WHL_CPU_ONLY="OFF" | |||
fi | |||
SRC_DIR=$(readlink -f "$(dirname $0)/../../../")
BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/ | |||
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||
BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_ON/MGE_INFERENCE_ONLY_OFF/Release/build/ | |||
fi | |||
NEW_LIB_PATH=core/lib | |||
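# build one wheel per Python ABI tag listed in ALL_PYTHON (e.g. "35m 36m 37m 38")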
for ver in ${ALL_PYTHON} | |||
do | |||
USE_AUDITWHEEL="ON" | |||
python_ver=${ver:0:2} | |||
MAJOR=${python_ver:0:1} | |||
MINOR=${ver:1} | |||
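# CPython interpreters in the manylinux2014 image live under /opt/python/cp<ver>-cp<abi>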
PYTHON_DIR=/opt/python/cp${python_ver}-cp${ver}/ | |||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} ${EXTRA_CMAKE_FLAG}" | |||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo" | |||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_PREFIX_PATH=${PYTHON_DIR}" | |||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_EXECUTABLE=${PYTHON_DIR}/bin/python3" | |||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_LIBRARY=${PYTHON_DIR}lib/" | |||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python${MAJOR}.${MINOR}" | |||
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DMGE_WITH_ATLAS=ON" | |||
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then | |||
${SRC_DIR}/scripts/cmake-build/host_build.sh -c -t -r | |||
else | |||
${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r | |||
fi | |||
cd ${BUILD_DIR} | |||
rm -rf staging | |||
mkdir -p staging | |||
cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/ | |||
handle_strip ${BUILD_DIR}/src/libmegengine_export.so | |||
cd ${BUILD_DIR}/staging/megengine/core | |||
handle_strip _imperative_rt.so | |||
mkdir -p lib/ucx | |||
if [ ${USE_AUDITWHEEL} = "OFF" ]; then | |||
patch_elf_depend_lib | |||
fi | |||
cd ${BUILD_DIR}/staging/ | |||
${PYTHON_DIR}/bin/python setup.py bdist_wheel | |||
cd /home/output | |||
if [ ${USE_AUDITWHEEL} = "ON" ]; then | |||
LD_LIBRARY_PATH=${BUILD_DIR}/dnn/cuda-stub:$LD_LIBRARY_PATH auditwheel repair -L ${NEW_LIB_PATH} ${BUILD_DIR}/staging/dist/Meg*.whl | |||
else | |||
mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR} | |||
cd ${BUILD_DIR}/staging/dist/ | |||
org_whl_name=`ls Meg*${ver}*.whl` | |||
compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'` | |||
echo "org whl name: ${org_whl_name}" | |||
echo "comapt whl name: ${compat_whl_name}" | |||
mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR}/${compat_whl_name} | |||
cd /home/output | |||
fi | |||
chown -R ${UID}:${UID} .
# make the output removable on the host side when running under rootless docker
chmod -R 777 . | |||
echo "python $ver done" | |||
done | |||
@@ -0,0 +1,70 @@ | |||
#!/bin/bash -e | |||
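# One-time setup of the manylinux2014 build image: pip plus pinned numpy/setuptools for every
# bundled CPython, and SWIG 3.0.12, LLVM/Clang 6.0 and patchelf 0.12 built from source.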
GET_PIP_URL='https://bootstrap.pypa.io/get-pip.py' | |||
SWIG_URL='https://downloads.sourceforge.net/project/swig/swig/swig-3.0.12/swig-3.0.12.tar.gz?use_mirror=autoselect' | |||
LLVM_URL='https://github.com/llvm-mirror/llvm/archive/release_60.tar.gz' | |||
CLANG_URL='https://github.com/llvm-mirror/clang/archive/release_60.tar.gz' | |||
yum install -y pcre-devel devtoolset-9-libatomic-devel.x86_64 | |||
yum install -y devtoolset-8 devtoolset-8-libatomic-devel.x86_64 | |||
for ver in 35m 36m 37m 38 | |||
do | |||
python_ver=${ver:0:2} | |||
curl ${GET_PIP_URL} | /opt/python/cp${python_ver}-cp${ver}/bin/python - \ | |||
--no-cache-dir --only-binary :all: | |||
/opt/python/cp${python_ver}-cp${ver}/bin/pip install \ | |||
--no-cache-dir --only-binary :all: numpy==1.18.1 setuptools==46.1.3 | |||
done | |||
pushd /home >/dev/null | |||
echo "Install swig" | |||
curl -sSL ${SWIG_URL} | tar xz | |||
pushd swig-3.0.12 >/dev/null | |||
mkdir build | |||
pushd build >/dev/null | |||
../configure | |||
make -j$(nproc) | |||
make install | |||
popd >/dev/null | |||
popd >/dev/null | |||
rm -rf swig-3.0.12 | |||
echo "Install llvm" | |||
curl -sSL ${LLVM_URL} | tar xz | |||
pushd llvm-release_60 >/dev/null | |||
mkdir build | |||
pushd build >/dev/null | |||
cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ | |||
-DCMAKE_BUILD_TYPE=Release | |||
make -j$(nproc) | |||
make install | |||
popd >/dev/null | |||
popd >/dev/null | |||
rm -rf llvm-release_60 | |||
echo "Install clang" | |||
curl -sSL ${CLANG_URL} | tar xz | |||
pushd clang-release_60 >/dev/null | |||
mkdir build | |||
pushd build >/dev/null | |||
cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \ | |||
-DCMAKE_BUILD_TYPE=Release | |||
make -j$(nproc) | |||
make install | |||
popd >/dev/null | |||
popd >/dev/null | |||
rm -rf clang-release_60 | |||
popd >/dev/null | |||
pushd /tmp >/dev/null | |||
curl -sSL https://github.com/NixOS/patchelf/archive/0.12.tar.gz | tar xz | |||
pushd /tmp/patchelf-0.12 >/dev/null | |||
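# patch the patchelf-0.12 sources before building (line 331: a 32 is widened to 64)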
sed -i '331s/32/64/' ./src/patchelf.cc | |||
./bootstrap.sh && ./configure && make install-strip | |||
popd | |||
rm -rf /tmp/patchelf-0.12 | |||
popd | |||
yum clean all |
@@ -0,0 +1,65 @@ | |||
#!/bin/bash | |||
set -e | |||
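# Configure yum and pip mirrors for the build image; only the internal "brainpp" platform
# needs the overrides below, any other platform is left untouched.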
function set_tuna_yum_mirror() { | |||
cp /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.bak | |||
local repo=/etc/yum.repos.d/CentOS-Base.repo | |||
local plugin=/etc/yum/pluginconf.d/fastestmirror.conf | |||
sed -i "s/mirrorlist=/#mirrorlist=/g" $repo | |||
sed -i "s/#baseurl/baseurl/g" $repo | |||
sed -i "s/mirror.centos.org/mirrors.tuna.tsinghua.edu.cn/g" $repo | |||
sed -i "s/http/https/g" $repo | |||
sed -i "s/enabled=1/enabled=0/g" $plugin | |||
yum clean all | |||
# builds on brainpp cannot pull EPEL repo metadata, so disable that repo here
# https://unix.stackexchange.com/questions/148144/unable-to-pull-epel-repository-metadata | |||
yum --disablerepo="epel" update nss | |||
yum makecache | |||
} | |||
function set_epel() { | |||
mv /etc/yum.repos.d/epel.repo /etc/yum.repos.d/epel.repo.backup | |||
mv /etc/yum.repos.d/epel-testing.repo /etc/yum.repos.d/epel-testing.repo.backup | |||
curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo | |||
} | |||
function set_yum_mirror() { | |||
mv /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.backup | |||
curl -o /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-7.repo | |||
yum makecache | |||
} | |||
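# point pip at the internal and public mirrors and raise the default timeout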
function set_pip_mirror() { | |||
cat > /etc/pip.conf <<EOF | |||
[global] | |||
timeout = 180 | |||
index-url = https://mirrors.aliyun.com/pypi/simple | |||
extra-index-url = | |||
http://mirrors.i.brainpp.cn/pypi/simple/ | |||
http://pypi.i.brainpp.cn/brain/dev/+simple | |||
https://pypi.tuna.tsinghua.edu.cn/simple | |||
trusted-host = | |||
mirrors.i.brainpp.cn | |||
pypi.i.brainpp.cn | |||
pypi.tuna.tsinghua.edu.cn | |||
mirrors.aliyun.com | |||
EOF | |||
} | |||
function main() { | |||
local platform=$1 | |||
case $platform in | |||
brainpp) | |||
set_epel | |||
set_yum_mirror | |||
set_pip_mirror | |||
;; | |||
*) | |||
echo "No setup required" | |||
;; | |||
esac | |||
} | |||
main "$@" |
@@ -81,8 +81,9 @@ void NMSKeep::CUDAKern::exec(const NMSKeep* opr, const DeviceTensorND& inp, | |||
init(opr, inp.shape()); | |||
auto inp_ptr = inp.ptr<float>(); | |||
auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace.raw_ptr()), | |||
dev_rm_mask = reinterpret_cast<uint64_t*>( | |||
void* workspace_ptr = workspace.raw_ptr(); | |||
auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace_ptr), | |||
dev_rm_mask = (uint64_t*)( | |||
workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align); | |||
auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()), | |||
out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>()); | |||
@@ -27,6 +27,9 @@ | |||
#include "megbrain/gopt/inference.h" | |||
#include "megbrain/gopt/misc.h" | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||
using namespace mgb; | |||
using namespace gopt; | |||
using namespace cg; | |||
@@ -1749,6 +1752,7 @@ void mgb::tensorrt::transform_dest_vars_inplace( | |||
optimizer.apply_inplace(dest_vars); | |||
} | |||
#pragma GCC diagnostic pop | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -20,7 +20,8 @@ | |||
#include "megbrain/utils/debug.h" | |||
#if MGB_ENABLE_TENSOR_RT | |||
#pragma GCC diagnostic push | |||
#pragma GCC diagnostic ignored "-Wdeprecated-declarations" | |||
#include "megbrain/tensorrt/tensorrt_opr.h" | |||
#include "make_trt_net.h" | |||
@@ -111,7 +112,8 @@ intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() { | |||
host_b = range_gen({1, 8, 1, 1}); | |||
{ | |||
float* ptr = reinterpret_cast<float*>(host_w->raw_ptr()); | |||
void* w_ptr = host_w->raw_ptr(); | |||
float* ptr = reinterpret_cast<float*>(w_ptr); | |||
ptr[0] = -127*1.1f; | |||
ptr[1] = 127*1.1f; | |||
} | |||
@@ -362,6 +364,7 @@ intl::ConcatConvTensorRTNetwork::create_trt_network(bool has_batch_dim) { | |||
return std::make_pair(builder, network); | |||
} | |||
#pragma GCC diagnostic pop | |||
#endif // MGB_ENABLE_TENSOR_RT | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |