GitOrigin-RevId: 4d9832e559
tags/v1.0.0-rc1
@@ -116,7 +116,7 @@ endif() | |||||
if(MSVC OR WIN32) | if(MSVC OR WIN32) | ||||
add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) | add_compile_definitions(NOMINMAX=1 _USE_MATH_DEFINES=1 WIN32=1) | ||||
message("-- into windows build...") | message("-- into windows build...") | ||||
message(" -- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") | |||||
message("-- CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") | |||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang-cl") | if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang-cl") | ||||
message(FATAL_ERROR "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md") | message(FATAL_ERROR "only support clang-cl for windows build, pls check detail: scripts/cmake-build/BUILD_README.md") | ||||
endif() | endif() | ||||
@@ -131,12 +131,20 @@ if(MSVC OR WIN32) | |||||
set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion") | set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=zero-as-null-pointer-constant -Wno-error=implicit-int-conversion") | ||||
set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default") | set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=float-conversion -Wno-error=shadow-field -Wno-error=covered-switch-default") | ||||
set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break") | set(WIN_FLAGS "${WIN_FLAGS} -Wno-error=deprecated -Wno-error=documentation -Wno-error=unreachable-code-break") | ||||
set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /D_WIN32_WINNT=0x0601") | |||||
set(WIN_FLAGS "${WIN_FLAGS} /DWIN32 -Wno-macro-redefined /D_WIN32_WINNT=0x0601 /wd4819") | |||||
set(WIN_FLAGS "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj") | set(WIN_FLAGS "${WIN_FLAGS} /D_CRT_SECURE_NO_DEPRECATE /D_CRT_SECURE_NO_WARNINGS /DNOGDI /D_USE_MATH_DEFINES /bigobj") | ||||
set(WIN_FLAGS "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport") | set(WIN_FLAGS "${WIN_FLAGS} /Zm500 /EHs /wd4351 /wd4291 /wd4250 /wd4996 /wd4819 -Wno-inconsistent-dllimport") | ||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}") | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${WIN_FLAGS}") | ||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}") | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WIN_FLAGS}") | ||||
#FIXME: fix halide JIT on windows | |||||
message("-- disable jit and halide on windows host build...") | |||||
set(MGE_WITH_HALIDE OFF) | |||||
set(MGE_WITH_JIT OFF) | |||||
#FIXME: fix MegRay on windows | |||||
message("-- Disable distributed build on windows host build...") | |||||
set(MGE_WITH_DISTRIBUTED OFF) | |||||
else() | else() | ||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") | ||||
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") | set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g") | ||||
@@ -286,7 +294,16 @@ if(MGE_WITH_CUDA) | |||||
set(CMAKE_CUDA_FLAGS_RELEASE "-O3") | set(CMAKE_CUDA_FLAGS_RELEASE "-O3") | ||||
set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g") | set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O3 -g") | ||||
set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") | set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Os") | ||||
set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") | |||||
if(MSVC OR WIN32) | |||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin -compress-all") | |||||
set(CCBIN_FLAG "${CCBIN_FLAG} /wd4819 /wd4334 /wd4267 /wd4002 /wd4244 /wd4068") | |||||
if(${CMAKE_BUILD_TYPE} STREQUAL "Debug" OR ${CMAKE_BUILD_TYPE} STREQUAL "RelWithDebInfo") | |||||
set(CCBIN_FLAG "${CCBIN_FLAG} -D_ITERATOR_DEBUG_LEVEL=2 -MTd") | |||||
endif() | |||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options \" ${CCBIN_FLAG} \" ") | |||||
else() | |||||
set(CMAKE_CUDA_FLAGS "-Xcompiler -Wall,-Wextra -Xfatbin -compress-all") | |||||
endif() | |||||
if(NOT MGE_ENABLE_RTTI) | if(NOT MGE_ENABLE_RTTI) | ||||
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti") | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-rtti") | ||||
@@ -332,15 +349,29 @@ if(MGE_WITH_CUDA) | |||||
endif() | endif() | ||||
if(MGE_CUDA_USE_STATIC) | if(MGE_CUDA_USE_STATIC) | ||||
if(MGE_WITH_TRT) | if(MGE_WITH_TRT) | ||||
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) | |||||
if(MSVC OR WIN32) | |||||
list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY}) | |||||
message("-- windows TRT_LIBRARY: ${TRT_LIBRARY}") | |||||
message("-- windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}") | |||||
else() | |||||
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive) | |||||
endif() | |||||
else() | else() | ||||
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) | list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive) | ||||
endif() | endif() | ||||
list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static) | |||||
if(MSVC OR WIN32) | |||||
list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib) | |||||
else() | |||||
list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static) | |||||
endif() | |||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") | if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0") | ||||
list(APPEND MGE_CUDA_LIBS cublasLt_static) | |||||
if(MSVC OR WIN32) | |||||
list(APPEND MGE_CUDA_LIBS cublasLt.lib) | |||||
else() | |||||
list(APPEND MGE_CUDA_LIBS cublasLt_static) | |||||
endif() | |||||
endif() | endif() | ||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") | |||||
if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32) | |||||
# mark all symbols from liblapack_static.a as weak to avoid | # mark all symbols from liblapack_static.a as weak to avoid | ||||
# duplicated definition with mkl | # duplicated definition with mkl | ||||
find_library( | find_library( | ||||
@@ -377,7 +408,11 @@ if(MGE_WITH_CUDA) | |||||
endif() | endif() | ||||
add_subdirectory(dnn/cuda-stub) | add_subdirectory(dnn/cuda-stub) | ||||
list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) | |||||
if(MSVC OR WIN32) | |||||
list(APPEND MGE_CUDA_LIBS nvrtc.lib cuda-stub) | |||||
else() | |||||
list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt) | |||||
endif() | |||||
set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}") | set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}") | ||||
endif() | endif() | ||||
@@ -699,3 +734,22 @@ if (NOT MGE_WITH_DISTRIBUTED) | |||||
${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake | ${CMAKE_CURRENT_BINARY_DIR}/MegEngineConfigVersion.cmake | ||||
DESTINATION ${MGE_INSTALL_CMAKEDIR}) | DESTINATION ${MGE_INSTALL_CMAKEDIR}) | ||||
endif() | endif() | ||||
if(MSVC OR WIN32) | |||||
add_compile_options( | |||||
$<$<CONFIG:>:/MT> | |||||
$<$<CONFIG:Debug>:/MTd> | |||||
$<$<CONFIG:Release>:/MT> | |||||
) | |||||
foreach (CompilerFlag | |||||
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE | |||||
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO | |||||
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE | |||||
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) | |||||
if(${CompilerFlag} MATCHES "/MD") | |||||
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||||
set(${CompilerFlag} "${${CompilerFlag}}" CACHE STRING "msvc compiler flags" FORCE) | |||||
message("MSVC flags: ${CompilerFlag}:${${CompilerFlag}}") | |||||
endif() | |||||
endforeach() | |||||
endif() |
@@ -9,7 +9,7 @@ endif() | |||||
if(MGE_CUDA_USE_STATIC) | if(MGE_CUDA_USE_STATIC) | ||||
find_library(CUDNN_LIBRARY | find_library(CUDNN_LIBRARY | ||||
NAMES libcudnn_static.a libcudnn_static.lib | |||||
NAMES libcudnn_static.a cudnn.lib | |||||
PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} | PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} | ||||
HINTS ${SYSTEM_LIBRARY_PATHS} | HINTS ${SYSTEM_LIBRARY_PATHS} | ||||
PATH_SUFFIXES lib lib64 | PATH_SUFFIXES lib lib64 | ||||
@@ -30,7 +30,7 @@ endif() | |||||
get_filename_component(__found_cudnn_root ${CUDNN_LIBRARY}/../.. REALPATH) | get_filename_component(__found_cudnn_root ${CUDNN_LIBRARY}/../.. REALPATH) | ||||
find_path(CUDNN_INCLUDE_DIR | find_path(CUDNN_INCLUDE_DIR | ||||
NAMES cudnn.h | NAMES cudnn.h | ||||
HINTS ${PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} | |||||
HINTS $ENV{PC_CUDNN_INCLUDE_DIRS} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_INCLUDE} ${__found_cudnn_root} | |||||
PATH_SUFFIXES include | PATH_SUFFIXES include | ||||
DOC "Path to CUDNN include directory." ) | DOC "Path to CUDNN include directory." ) | ||||
@@ -1,17 +1,17 @@ | |||||
if($ENV{LIBRARY_PATH}) | |||||
if(NOT "$ENV{LIBRARY_PATH}" STREQUAL "") | |||||
string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH}) | string(REPLACE ":" ";" SYSTEM_LIBRARY_PATHS $ENV{LIBRARY_PATH}) | ||||
endif() | endif() | ||||
if(MGE_CUDA_USE_STATIC) | if(MGE_CUDA_USE_STATIC) | ||||
find_library(TRT_LIBRARY | find_library(TRT_LIBRARY | ||||
NAMES libnvinfer_static.a libnvinfer_static.lib | |||||
NAMES libnvinfer_static.a nvinfer.lib | |||||
PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} | PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} | ||||
HINTS ${SYSTEM_LIBRARY_PATHS} | HINTS ${SYSTEM_LIBRARY_PATHS} | ||||
PATH_SUFFIXES lib lib64 | PATH_SUFFIXES lib lib64 | ||||
DOC "TRT library." ) | DOC "TRT library." ) | ||||
else() | else() | ||||
find_library(TRT_LIBRARY | find_library(TRT_LIBRARY | ||||
NAMES libnvinfer.so libnvinfer.dylib | |||||
NAMES libnvinfer.so libnvinfer.dylib nvinfer.dll | |||||
PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} | PATHS $ENV{LD_LIBRARY_PATH} ${TRT_ROOT_DIR} ${CMAKE_INSTALL_PREFIX} | ||||
HINTS ${SYSTEM_LIBRARY_PATHS} | HINTS ${SYSTEM_LIBRARY_PATHS} | ||||
PATH_SUFFIXES lib lib64 | PATH_SUFFIXES lib lib64 | ||||
@@ -1,7 +1,15 @@ | |||||
file (GLOB_RECURSE SOURCES src/*.cpp) | file (GLOB_RECURSE SOURCES src/*.cpp) | ||||
add_library (cuda-stub SHARED ${SOURCES}) | |||||
if(MSVC OR WIN32) | |||||
add_library (cuda-stub STATIC ${SOURCES}) | |||||
else() | |||||
add_library (cuda-stub SHARED ${SOURCES}) | |||||
endif() | |||||
set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda) | set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda) | ||||
target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) | target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL) | ||||
target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) | |||||
if (MSVC OR WIN32) | |||||
target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined) | |||||
else() | |||||
target_link_libraries(cuda-stub PRIVATE dl -Wl,--no-undefined) | |||||
endif() | |||||
install (TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS}) | install (TARGETS cuda-stub EXPORT ${MGE_EXPORT_TARGETS}) |
@@ -140,4 +140,9 @@ | |||||
#define MEGDNN_DEVICE | #define MEGDNN_DEVICE | ||||
#endif | #endif | ||||
#if defined(_MSC_VER) || defined(WIN32) | |||||
#define ATTR_ALIGNED(v) __declspec(align(v)) | |||||
#else | |||||
#define ATTR_ALIGNED(v) __attribute__((aligned(v))) | |||||
#endif | |||||
// vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen |
@@ -215,9 +215,9 @@ struct TensorLayout : public TensorShape { | |||||
DType dtype; | DType dtype; | ||||
Format format; | Format format; | ||||
#if MEGDNN_CC_HOST | |||||
TensorLayout(); | TensorLayout(); | ||||
#if MEGDNN_CC_HOST | |||||
TensorLayout(const TensorLayout& layout) = default; | TensorLayout(const TensorLayout& layout) = default; | ||||
//! create empty layout with given dtype | //! create empty layout with given dtype | ||||
@@ -275,51 +275,52 @@ __global__ void kern_fwd_half(__half* dst, const __half* src, | |||||
#define SET_SW(func, type, sw) \ | #define SET_SW(func, type, sw) \ | ||||
if (param.flt_h == 2 && param.flt_w == 2) { \ | if (param.flt_h == 2 && param.flt_w == 2) { \ | ||||
kern = func<type, 1, 2, 2, sw>; \ | |||||
f_struct.f = func<type, 1, 2, 2, sw>; \ | |||||
} else if (param.flt_h == 3 && param.flt_w == 3) { \ | } else if (param.flt_h == 3 && param.flt_w == 3) { \ | ||||
kern = func<type, 1, 3, 3, sw>; \ | |||||
f_struct.f = func<type, 1, 3, 3, sw>; \ | |||||
} else if (param.flt_h == 5 && param.flt_w == 5) { \ | } else if (param.flt_h == 5 && param.flt_w == 5) { \ | ||||
kern = func<type, 1, 5, 5, sw>; \ | |||||
f_struct.f = func<type, 1, 5, 5, sw>; \ | |||||
} else if (param.flt_h == 7 && param.flt_w == 7) { \ | } else if (param.flt_h == 7 && param.flt_w == 7) { \ | ||||
kern = func<type, 1, 7, 7, sw>; \ | |||||
f_struct.f = func<type, 1, 7, 7, sw>; \ | |||||
} else { \ | } else { \ | ||||
kern = func<type, 1, 0, 0, sw>; \ | |||||
f_struct.f = func<type, 1, 0, 0, sw>; \ | |||||
} | } | ||||
#define GET_KERN(func, type) \ | |||||
void (*kern)(type*, const type*, const type*, Param); \ | |||||
if (param.chl_mul == 1) { \ | |||||
if (param.stride_w == 1) { \ | |||||
SET_SW(func, type, 1) \ | |||||
} else { \ | |||||
SET_SW(func, type, 0) \ | |||||
} \ | |||||
} else { \ | |||||
kern = func<type, 0, 0, 0, 0>; \ | |||||
} \ | |||||
return kern; | |||||
#define GET_KERN(func, type) \ | |||||
FixFunction<type> f_struct; \ | |||||
if (param.chl_mul == 1) { \ | |||||
if (param.stride_w == 1) { \ | |||||
SET_SW(func, type, 1) \ | |||||
} else { \ | |||||
SET_SW(func, type, 0) \ | |||||
} \ | |||||
} else { \ | |||||
f_struct.f = func<type, 0, 0, 0, 0>; \ | |||||
} \ | |||||
return f_struct; | |||||
template <typename T> | template <typename T> | ||||
void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); | |||||
struct FixFunction { | |||||
void (*f)(T*, const T*, const T*, Param); | |||||
}; | |||||
template <typename T> | |||||
FixFunction<T> get_kern(const Param& param); | |||||
template <> | template <> | ||||
void (*get_kern<float>(const Param& param))(float*, const float*, const float*, | |||||
const Param) { | |||||
FixFunction<float> get_kern<float>(const Param& param) { | |||||
GET_KERN(kern_fwd_float, float); | GET_KERN(kern_fwd_float, float); | ||||
} | } | ||||
#if CUDA_VERSION >= 9000 | #if CUDA_VERSION >= 9000 | ||||
template <> | template <> | ||||
void (*get_kern<__half>(const Param& param))(__half*, const __half*, | |||||
const __half*, const Param) { | |||||
FixFunction<__half> get_kern<__half>(const Param& param) { | |||||
GET_KERN(kern_fwd_half, __half); | GET_KERN(kern_fwd_half, __half); | ||||
} | } | ||||
#endif | #endif | ||||
template <> | template <> | ||||
void (*get_kern<dt_float16>(const Param& param))(dt_float16*, const dt_float16*, | |||||
const dt_float16*, | |||||
const Param) { | |||||
FixFunction<dt_float16> get_kern<dt_float16>(const Param& param) { | |||||
GET_KERN(kern_fwd_float, dt_float16); | GET_KERN(kern_fwd_float, dt_float16); | ||||
} | } | ||||
@@ -337,7 +338,7 @@ template <typename T> | |||||
void run_fwd(T* dst, const T* src, const T* flt, const Param& param, | void run_fwd(T* dst, const T* src, const T* flt, const Param& param, | ||||
cudaStream_t stream) { | cudaStream_t stream) { | ||||
void (*kern)(T*, const T*, const T*, Param); | void (*kern)(T*, const T*, const T*, Param); | ||||
kern = get_kern<T>(param); | |||||
kern = get_kern<T>(param).f; | |||||
int nr_thread = query_blocksize_for_kernel(kern), | int nr_thread = query_blocksize_for_kernel(kern), | ||||
nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul; | nr_out_dimx = param.out_h * param.out_w * param.batch * param.chl_mul; | ||||
@@ -178,25 +178,29 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, | |||||
__half2 dst2 = {0.0, 0.0}; | __half2 dst2 = {0.0, 0.0}; | ||||
if (static_cast<uint32_t>(ow) < | if (static_cast<uint32_t>(ow) < | ||||
static_cast<uint32_t>(owmin_y)) { | static_cast<uint32_t>(owmin_y)) { | ||||
dst2 = {*(pd + ow), 0.0}; | |||||
dst2.x = *(pd + ow); | |||||
dst2.y = 0.0; | |||||
sum = fma2(dst2, flt3, sum); | sum = fma2(dst2, flt3, sum); | ||||
++ow; | ++ow; | ||||
--fw; | --fw; | ||||
} | } | ||||
if (static_cast<uint32_t>(owmax_x) < | if (static_cast<uint32_t>(owmax_x) < | ||||
static_cast<uint32_t>(owmax)) { | static_cast<uint32_t>(owmax)) { | ||||
dst2 = {0.0, *(pd + owmax)}; | |||||
dst2.x = 0.0; | |||||
dst2.y = *(pd + owmax); | |||||
sum = fma2(dst2, flt0, sum); | sum = fma2(dst2, flt0, sum); | ||||
} | } | ||||
if (static_cast<uint32_t>(fw) == 1) { | if (static_cast<uint32_t>(fw) == 1) { | ||||
dst2 = {*(pd + ow), *(pd + ow)}; | |||||
dst2.x = *(pd + ow); | |||||
dst2.y = *(pd + ow); | |||||
sum = fma2(dst2, flt2, sum); | sum = fma2(dst2, flt2, sum); | ||||
++ow; | ++ow; | ||||
--fw; | --fw; | ||||
} | } | ||||
if (static_cast<uint32_t>(ow) <= | if (static_cast<uint32_t>(ow) <= | ||||
static_cast<uint32_t>(owmax_x)) { | static_cast<uint32_t>(owmax_x)) { | ||||
dst2 = {*(pd + ow), *(pd + ow)}; | |||||
dst2.x = *(pd + ow); | |||||
dst2.y = *(pd + ow); | |||||
sum = fma2(dst2, flt1, sum); | sum = fma2(dst2, flt1, sum); | ||||
} | } | ||||
@@ -218,18 +222,21 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, | |||||
__half2 dst2 = {0.0, 0.0}; | __half2 dst2 = {0.0, 0.0}; | ||||
if (static_cast<uint32_t>(ow) < | if (static_cast<uint32_t>(ow) < | ||||
static_cast<uint32_t>(owmin_y)) { | static_cast<uint32_t>(owmin_y)) { | ||||
dst2 = {*(pd + ow), 0.0}; | |||||
dst2.x = *(pd + ow); | |||||
dst2.y = 0.0; | |||||
sum = fma2(dst2, flt5, sum); | sum = fma2(dst2, flt5, sum); | ||||
++ow; | ++ow; | ||||
--fw; | --fw; | ||||
} | } | ||||
if (static_cast<uint32_t>(owmax_x) < | if (static_cast<uint32_t>(owmax_x) < | ||||
static_cast<uint32_t>(owmax)) { | static_cast<uint32_t>(owmax)) { | ||||
dst2 = {0.0, *(pd + owmax)}; | |||||
dst2.x = 0.0; | |||||
dst2.y = *(pd + owmax); | |||||
sum = fma2(dst2, flt0, sum); | sum = fma2(dst2, flt0, sum); | ||||
} | } | ||||
if (static_cast<uint32_t>(fw) == 3) { | if (static_cast<uint32_t>(fw) == 3) { | ||||
dst2 = {*(pd + ow), *(pd + ow)}; | |||||
dst2.x = *(pd + ow); | |||||
dst2.y = *(pd + ow); | |||||
sum = fma2(dst2, flt4, sum); | sum = fma2(dst2, flt4, sum); | ||||
++ow; | ++ow; | ||||
--fw; | --fw; | ||||
@@ -237,7 +244,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, | |||||
if (static_cast<uint32_t>(fw) == 2 && | if (static_cast<uint32_t>(fw) == 2 && | ||||
static_cast<uint32_t>(ow) <= | static_cast<uint32_t>(ow) <= | ||||
static_cast<uint32_t>(owmax_x)) { | static_cast<uint32_t>(owmax_x)) { | ||||
dst2 = {*(pd + ow), *(pd + ow)}; | |||||
dst2.x = *(pd + ow); | |||||
dst2.y = *(pd + ow); | |||||
sum = fma2(dst2, flt3, sum); | sum = fma2(dst2, flt3, sum); | ||||
++ow; | ++ow; | ||||
--fw; | --fw; | ||||
@@ -245,7 +253,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, | |||||
if (static_cast<uint32_t>(fw) == 1 && | if (static_cast<uint32_t>(fw) == 1 && | ||||
static_cast<uint32_t>(ow) <= | static_cast<uint32_t>(ow) <= | ||||
static_cast<uint32_t>(owmax_x)) { | static_cast<uint32_t>(owmax_x)) { | ||||
dst2 = {*(pd + ow), *(pd + ow)}; | |||||
dst2.x = *(pd + ow); | |||||
dst2.y = *(pd + ow); | |||||
sum = fma2(dst2, flt2, sum); | sum = fma2(dst2, flt2, sum); | ||||
++ow; | ++ow; | ||||
--fw; | --fw; | ||||
@@ -253,7 +262,8 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, | |||||
if (static_cast<uint32_t>(fw) == 0 && | if (static_cast<uint32_t>(fw) == 0 && | ||||
static_cast<uint32_t>(ow) <= | static_cast<uint32_t>(ow) <= | ||||
static_cast<uint32_t>(owmax_x)) { | static_cast<uint32_t>(owmax_x)) { | ||||
dst2 = {*(pd + ow), *(pd + ow)}; | |||||
dst2.x = *(pd + ow); | |||||
dst2.y = *(pd + ow); | |||||
sum = fma2(dst2, flt1, sum); | sum = fma2(dst2, flt1, sum); | ||||
} | } | ||||
@@ -270,8 +280,10 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, | |||||
uint32_t fw = iw - ow + PW; | uint32_t fw = iw - ow + PW; | ||||
if (static_cast<uint32_t>(ow) <= | if (static_cast<uint32_t>(ow) <= | ||||
static_cast<uint32_t>(owmax)) { | static_cast<uint32_t>(owmax)) { | ||||
pd2 = {*(pd + ow), *(pd + ow)}; | |||||
pf2 = {0.0, 0.0}; | |||||
pd2.x = *(pd + ow); | |||||
pd2.y = *(pd + ow); | |||||
pf2.x = 0.0; | |||||
pf2.y = 0.0; | |||||
if (static_cast<uint32_t>(ow) >= | if (static_cast<uint32_t>(ow) >= | ||||
static_cast<uint32_t>(owmin_y)) | static_cast<uint32_t>(owmin_y)) | ||||
pf2.y = *(pf + fw + 1); | pf2.y = *(pf + fw + 1); | ||||
@@ -425,16 +437,17 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, | |||||
#define sh param.stride_h | #define sh param.stride_h | ||||
#define sw param.stride_w | #define sw param.stride_w | ||||
#define SET_STRIDE(func, type, chl_mul, fh, fw) \ | |||||
if (sh == 1 && sw == 1) { \ | |||||
kern_ptr = func<type, chl_mul, fh, fw, 1, 1>; \ | |||||
} else if (sh == 2 && sw == 2) { \ | |||||
kern_ptr = func<type, chl_mul, fh, fw, 2, 2>; \ | |||||
} else { \ | |||||
kern_ptr = func<type, chl_mul, fh, fw, 0, 0>; \ | |||||
#define SET_STRIDE(func, type, chl_mul, fh, fw) \ | |||||
if (sh == 1 && sw == 1) { \ | |||||
f_struct.f = func<type, chl_mul, fh, fw, 1, 1>; \ | |||||
} else if (sh == 2 && sw == 2) { \ | |||||
f_struct.f = func<type, chl_mul, fh, fw, 2, 2>; \ | |||||
} else { \ | |||||
f_struct.f = func<type, chl_mul, fh, fw, 0, 0>; \ | |||||
} | } | ||||
#define GET_KERN(func, type) \ | #define GET_KERN(func, type) \ | ||||
FixFunction<type> f_struct; \ | |||||
if (param.chl_mul == 1) { \ | if (param.chl_mul == 1) { \ | ||||
if (param.flt_h == 3 && param.flt_w == 3) { \ | if (param.flt_h == 3 && param.flt_w == 3) { \ | ||||
SET_STRIDE(func, type, 1, 3, 3); \ | SET_STRIDE(func, type, 1, 3, 3); \ | ||||
@@ -447,36 +460,32 @@ __global__ void kern_bwd_data_hf(__half* src_grad, const __half* dst_grad, | |||||
} \ | } \ | ||||
} else { \ | } else { \ | ||||
SET_STRIDE(func, type, 0, 0, 0); \ | SET_STRIDE(func, type, 0, 0, 0); \ | ||||
} | |||||
} \ | |||||
return f_struct; | |||||
template <typename T> | |||||
struct FixFunction { | |||||
void (*f)(T*, const T*, const T*, const Param); | |||||
}; | |||||
template <typename T> | template <typename T> | ||||
void (*get_kern(const Param& param))(T*, const T*, const T*, const Param); | |||||
FixFunction<T> get_kern(const Param& param); | |||||
template <> | template <> | ||||
void (*get_kern<float>(const Param& param))(float*, const float*, const float*, | |||||
const Param) { | |||||
void (*kern_ptr)(float*, const float*, const float*, Param); | |||||
FixFunction<float> get_kern<float>(const Param& param) { | |||||
GET_KERN(kern_bwd_data_float, float); | GET_KERN(kern_bwd_data_float, float); | ||||
return kern_ptr; | |||||
} | } | ||||
#if CUDA_VERSION >= 9000 | #if CUDA_VERSION >= 9000 | ||||
template <> | template <> | ||||
void (*get_kern<__half>(const Param& param))(__half*, const __half*, | |||||
const __half*, const Param) { | |||||
void (*kern_ptr)(__half*, const __half*, const __half*, Param); | |||||
FixFunction<__half> get_kern<__half>(const Param& param) { | |||||
GET_KERN(kern_bwd_data_hf, __half); | GET_KERN(kern_bwd_data_hf, __half); | ||||
return kern_ptr; | |||||
} | } | ||||
#endif | #endif | ||||
template <> | template <> | ||||
void (*get_kern<dt_float16>(const Param& param))(dt_float16*, const dt_float16*, | |||||
const dt_float16*, | |||||
const Param) { | |||||
void (*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param); | |||||
FixFunction<dt_float16> get_kern<dt_float16>(const Param& param) { | |||||
GET_KERN(kern_bwd_data_float, dt_float16); | GET_KERN(kern_bwd_data_float, dt_float16); | ||||
return kern_ptr; | |||||
} | } | ||||
#undef sh | #undef sh | ||||
@@ -494,7 +503,7 @@ template <typename T> | |||||
void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, | void run_bwd_data(T* src_grad, const T* dst_grad, const T* flt, | ||||
const Param& param, cudaStream_t stream) { | const Param& param, cudaStream_t stream) { | ||||
void (*kern)(T*, const T*, const T*, Param); | void (*kern)(T*, const T*, const T*, Param); | ||||
kern = get_kern<T>(param); | |||||
kern = get_kern<T>(param).f; | |||||
int nr_thread = query_blocksize_for_kernel(kern), | int nr_thread = query_blocksize_for_kernel(kern), | ||||
nr_out_dimx = param.src_h * param.src_w * param.batch; | nr_out_dimx = param.src_h * param.src_w * param.batch; | ||||
@@ -193,7 +193,8 @@ __global__ void kern_bwd_filter_hf( | |||||
return; | return; | ||||
} | } | ||||
sum2 = {0.0, 0.0}; | |||||
sum2.x = 0.0; | |||||
sum2.y = 0.0; | |||||
__half2 src2{0.0, 0.0}; | __half2 src2{0.0, 0.0}; | ||||
__half2 dst2{0.0, 0.0}; | __half2 dst2{0.0, 0.0}; | ||||
@@ -330,51 +331,74 @@ __global__ void kern_bwd_filter_hf( | |||||
} | } | ||||
#endif | #endif | ||||
#define GET_KERN(func, type) \ | |||||
switch(_p) { \ | |||||
case 1<<10: kern_ptr = func<type, 1<<10>; break; \ | |||||
case 1<<9: kern_ptr = func<type, 1<<9>; break; \ | |||||
case 1<<8: kern_ptr = func<type, 1<<8>; break; \ | |||||
case 1<<7: kern_ptr = func<type, 1<<7>; break; \ | |||||
case 1<<6: kern_ptr = func<type, 1<<6>; break; \ | |||||
case 1<<5: kern_ptr = func<type, 1<<5>; break; \ | |||||
case 1<<4: kern_ptr = func<type, 1<<4>; break; \ | |||||
case 1<<3: kern_ptr = func<type, 1<<3>; break; \ | |||||
case 1<<2: kern_ptr = func<type, 1<<2>; break; \ | |||||
case 1<<1: kern_ptr = func<type, 1<<1>; break; \ | |||||
case 1<<0: kern_ptr = func<type, 1<<0>; break; \ | |||||
} | |||||
#define GET_KERN(func, type) \ | |||||
FixFunction<type> f_struct; \ | |||||
switch (_p) { \ | |||||
case 1 << 10: \ | |||||
f_struct.f = func<type, 1 << 10>; \ | |||||
break; \ | |||||
case 1 << 9: \ | |||||
f_struct.f = func<type, 1 << 9>; \ | |||||
break; \ | |||||
case 1 << 8: \ | |||||
f_struct.f = func<type, 1 << 8>; \ | |||||
break; \ | |||||
case 1 << 7: \ | |||||
f_struct.f = func<type, 1 << 7>; \ | |||||
break; \ | |||||
case 1 << 6: \ | |||||
f_struct.f = func<type, 1 << 6>; \ | |||||
break; \ | |||||
case 1 << 5: \ | |||||
f_struct.f = func<type, 1 << 5>; \ | |||||
break; \ | |||||
case 1 << 4: \ | |||||
f_struct.f = func<type, 1 << 4>; \ | |||||
break; \ | |||||
case 1 << 3: \ | |||||
f_struct.f = func<type, 1 << 3>; \ | |||||
break; \ | |||||
case 1 << 2: \ | |||||
f_struct.f = func<type, 1 << 2>; \ | |||||
break; \ | |||||
case 1 << 1: \ | |||||
f_struct.f = func<type, 1 << 1>; \ | |||||
break; \ | |||||
case 1 << 0: \ | |||||
f_struct.f = func<type, 1 << 0>; \ | |||||
break; \ | |||||
default: \ | |||||
megdnn_assert(false, "DO NOT IMP CASE FUNCTION!!"); \ | |||||
} \ | |||||
return f_struct; | |||||
template <typename T> | |||||
struct FixFunction { | |||||
void (*f)(T*, const T*, const T*, Param); | |||||
}; | |||||
template <typename T> | template <typename T> | ||||
void (*get_kern(const uint32_t& _p))(T*, const T*, const T*, Param); | |||||
FixFunction<T> get_kern(const uint32_t& _p); | |||||
template <> | template <> | ||||
void (*get_kern<float>(const uint32_t& _p))(float*, const float*, const float*, Param) { | |||||
void (*kern_ptr)(float*, const float*, const float*, Param) = NULL; | |||||
GET_KERN(kern_bwd_filter_float, float); | |||||
return kern_ptr; | |||||
FixFunction<float> get_kern<float>(const uint32_t& _p) { | |||||
GET_KERN(kern_bwd_filter_float, float); | |||||
} | } | ||||
#if CUDA_VERSION >= 9000 | #if CUDA_VERSION >= 9000 | ||||
template <> | template <> | ||||
void (*get_kern<__half>(const uint32_t& _p))(__half*, const __half*, const __half*, Param) { | |||||
void (*kern_ptr)(__half*, const __half*, const __half*, Param) = NULL; | |||||
GET_KERN(kern_bwd_filter_hf, __half); | |||||
return kern_ptr; | |||||
FixFunction<__half> get_kern<__half>(const uint32_t& _p) { | |||||
GET_KERN(kern_bwd_filter_hf, __half); | |||||
} | } | ||||
#endif | #endif | ||||
template <> | template <> | ||||
void (*get_kern<dt_float16>(const uint32_t& _p))(dt_float16*, const dt_float16*, | |||||
const dt_float16*, Param) { | |||||
void (*kern_ptr)(dt_float16*, const dt_float16*, const dt_float16*, Param) = NULL; | |||||
FixFunction<dt_float16> get_kern<dt_float16>(const uint32_t& _p) { | |||||
GET_KERN(kern_bwd_filter_float, dt_float16); | GET_KERN(kern_bwd_filter_float, dt_float16); | ||||
return kern_ptr; | |||||
} | } | ||||
#undef GET_KERN | #undef GET_KERN | ||||
} // anonymous namespace | |||||
} // anonymous namespace | |||||
namespace megdnn { | namespace megdnn { | ||||
namespace cuda { | namespace cuda { | ||||
@@ -385,7 +409,7 @@ void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, | |||||
const Param ¶m, cudaStream_t stream) { | const Param ¶m, cudaStream_t stream) { | ||||
void (*kern)(T*, const T*, const T*, Param) = NULL; | void (*kern)(T*, const T*, const T*, Param) = NULL; | ||||
uint32_t | uint32_t | ||||
nr_thread = query_blocksize_for_kernel(get_kern<T>(1024)), | |||||
nr_thread = query_blocksize_for_kernel(get_kern<T>(1024).f), | |||||
nr_thpf = std::min(nr_thread, | nr_thpf = std::min(nr_thread, | ||||
std::max<uint32_t>( | std::max<uint32_t>( | ||||
1, | 1, | ||||
@@ -395,7 +419,7 @@ void run_bwd_filter(T *filter_grad, const T *src, const T *dst_grad, | |||||
do { | do { | ||||
#define CK(_n) \ | #define CK(_n) \ | ||||
if (nr_thpf >= _n) { \ | if (nr_thpf >= _n) { \ | ||||
kern = get_kern<T>(_n); \ | |||||
kern = get_kern<T>(_n).f; \ | |||||
nr_thpf = _n; \ | nr_thpf = _n; \ | ||||
break; \ | break; \ | ||||
} | } | ||||
@@ -155,7 +155,7 @@ struct BlockTileIteratorBasic { | |||||
filter_gl2sh_visitor.copy(); | filter_gl2sh_visitor.copy(); | ||||
} | } | ||||
consumer.template consume_block(src_gl2sh_visitor, | |||||
consumer.consume_block(src_gl2sh_visitor, | |||||
filter_gl2sh_visitor); | filter_gl2sh_visitor); | ||||
if (!(ci_outer == ci_blks - 1 && h == h_end && | if (!(ci_outer == ci_blks - 1 && h == h_end && | ||||
@@ -171,7 +171,7 @@ struct BlockTileIterator_COxHW { | |||||
filter_gl2sh_visitor.copy(); | filter_gl2sh_visitor.copy(); | ||||
} | } | ||||
consumer.template consume_block(src_gl2sh_visitor, | |||||
consumer.consume_block(src_gl2sh_visitor, | |||||
filter_gl2sh_visitor); | filter_gl2sh_visitor); | ||||
if (!(ci_outer == ci_blks - 1 && f == filter_pixels - 1)) { | if (!(ci_outer == ci_blks - 1 && f == filter_pixels - 1)) { | ||||
@@ -162,7 +162,7 @@ struct BlockTileIteratorUnrollWidth { | |||||
filter_gl2sh_visitor.copy(); | filter_gl2sh_visitor.copy(); | ||||
} | } | ||||
consumer.template consume_block(src_gl2sh_visitor, | |||||
consumer.consume_block(src_gl2sh_visitor, | |||||
filter_gl2sh_visitor); | filter_gl2sh_visitor); | ||||
if (!(ci_outer == ci_blks - 1 && h == h_end && | if (!(ci_outer == ci_blks - 1 && h == h_end && | ||||
@@ -154,7 +154,7 @@ struct BlockTileIteratorUnrollWidthV2 { | |||||
filter_gl2sh_visitor.copy(); | filter_gl2sh_visitor.copy(); | ||||
} | } | ||||
consumer.template consume_block(src_gl2sh_visitor, | |||||
consumer.consume_block(src_gl2sh_visitor, | |||||
filter_gl2sh_visitor); | filter_gl2sh_visitor); | ||||
if (!(ci_outer == ci_blks - 1 && h == h_end)) { | if (!(ci_outer == ci_blks - 1 && h == h_end)) { | ||||
@@ -72,7 +72,7 @@ __global__ void convolution_kernel( | |||||
DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src}; | DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src}; | ||||
FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; | FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; | ||||
if (check_bounds) { | if (check_bounds) { | ||||
block_iterator.template set_remain(src_gl2sh_visitor, | |||||
block_iterator.set_remain(src_gl2sh_visitor, | |||||
filter_gl2sh_visitor); | filter_gl2sh_visitor); | ||||
} | } | ||||
@@ -89,7 +89,7 @@ __global__ void convolution_kernel( | |||||
GlobalMemoryWriter global_memory_writer; | GlobalMemoryWriter global_memory_writer; | ||||
global_memory_writer.init(smem_dst, alpha, beta); | global_memory_writer.init(smem_dst, alpha, beta); | ||||
if (check_bounds) { | if (check_bounds) { | ||||
block_iterator.template set_remain(global_memory_writer); | |||||
block_iterator.set_remain(global_memory_writer); | |||||
} | } | ||||
bias.move(block_iterator.block_batch, block_iterator.block_out_channel, | bias.move(block_iterator.block_batch, block_iterator.block_out_channel, | ||||
block_iterator.block_out_height, block_iterator.block_out_width); | block_iterator.block_out_height, block_iterator.block_out_width); | ||||
@@ -130,7 +130,7 @@ __global__ void convolution_kernel_precomp_offset( | |||||
DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src, offset}; | DataGlobal2ShareMemVisitor src_gl2sh_visitor{smem_src, offset}; | ||||
FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; | FilterGlobal2ShareMemVisitor filter_gl2sh_visitor{smem_filter}; | ||||
if (check_bounds) { | if (check_bounds) { | ||||
block_iterator.template set_remain(src_gl2sh_visitor, | |||||
block_iterator.set_remain(src_gl2sh_visitor, | |||||
filter_gl2sh_visitor); | filter_gl2sh_visitor); | ||||
} | } | ||||
@@ -147,7 +147,7 @@ __global__ void convolution_kernel_precomp_offset( | |||||
GlobalMemoryWriter global_memory_writer; | GlobalMemoryWriter global_memory_writer; | ||||
global_memory_writer.init(smem_dst, alpha, beta); | global_memory_writer.init(smem_dst, alpha, beta); | ||||
if (check_bounds) { | if (check_bounds) { | ||||
block_iterator.template set_remain(global_memory_writer); | |||||
block_iterator.set_remain(global_memory_writer); | |||||
} | } | ||||
bias.move(block_iterator.block_batch, block_iterator.block_out_channel, | bias.move(block_iterator.block_batch, block_iterator.block_out_channel, | ||||
block_iterator.block_out_height, block_iterator.block_out_width); | block_iterator.block_out_height, block_iterator.block_out_width); | ||||
@@ -259,8 +259,8 @@ void DeformablePSROIPoolForward(const TensorND& data, const TensorND& rois, | |||||
auto&& out_data_elems = out_data.layout.total_nr_elems(); | auto&& out_data_elems = out_data.layout.total_nr_elems(); | ||||
auto&& out_count_elems = out_count.layout.total_nr_elems(); | auto&& out_count_elems = out_count.layout.total_nr_elems(); | ||||
size_t out_data_bytes = sizeof(float[out_data_elems]); | |||||
size_t out_count_bytes = sizeof(float[out_count_elems]); | |||||
size_t out_data_bytes = sizeof(float) * out_data_elems; | |||||
size_t out_count_bytes = sizeof(float) * out_count_elems; | |||||
cudaMemsetAsync(out_data_ptr, 0, out_data_bytes, p.stream); | cudaMemsetAsync(out_data_ptr, 0, out_data_bytes, p.stream); | ||||
cudaMemsetAsync(out_count_ptr, 0, out_count_bytes, p.stream); | cudaMemsetAsync(out_count_ptr, 0, out_count_bytes, p.stream); | ||||
@@ -292,8 +292,8 @@ void DeformablePSROIPoolBackwardAcc(const TensorND& data, const TensorND& rois, | |||||
auto&& data_diff_elems = data_diff.layout.total_nr_elems(); | auto&& data_diff_elems = data_diff.layout.total_nr_elems(); | ||||
auto&& trans_diff_elems = trans_diff.layout.total_nr_elems(); | auto&& trans_diff_elems = trans_diff.layout.total_nr_elems(); | ||||
size_t data_diff_bytes = sizeof(float[data_diff_elems]); | |||||
size_t trans_diff_bytes = sizeof(float[trans_diff_elems]); | |||||
size_t data_diff_bytes = sizeof(float) * data_diff_elems; | |||||
size_t trans_diff_bytes = sizeof(float) * trans_diff_elems; | |||||
cudaMemsetAsync(data_diff_ptr, 0, data_diff_bytes, p.stream); | cudaMemsetAsync(data_diff_ptr, 0, data_diff_bytes, p.stream); | ||||
cudaMemsetAsync(trans_diff_ptr, 0, trans_diff_bytes, p.stream); | cudaMemsetAsync(trans_diff_ptr, 0, trans_diff_bytes, p.stream); | ||||
@@ -58,7 +58,7 @@ enum BcastType { | |||||
template <typename ctype> | template <typename ctype> | ||||
class VectTypeTrait; | class VectTypeTrait; | ||||
struct __attribute__((aligned(8))) half4 { | |||||
struct ATTR_ALIGNED(8) half4 { | |||||
dt_float16 x, y, z, w; | dt_float16 x, y, z, w; | ||||
}; | }; | ||||
@@ -69,7 +69,7 @@ __device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y, | |||||
return t; | return t; | ||||
} | } | ||||
struct __attribute__((aligned(8))) bhalf4 { | |||||
struct ATTR_ALIGNED(8) bhalf4 { | |||||
dt_bfloat16 x, y, z, w; | dt_bfloat16 x, y, z, w; | ||||
}; | }; | ||||
@@ -1,8 +1,8 @@ | |||||
# build support status | # build support status | ||||
## host build | ## host build | ||||
* windows build (ok) | |||||
* linux build (ok) | |||||
* macos build (ok) | |||||
* windows build (cpu + gpu) | |||||
* linux build (cpu + gpu) | |||||
* macos build (cpu only) | |||||
## cross build | ## cross build | ||||
* windows cross build arm-android (ok) | * windows cross build arm-android (ok) | ||||
* windows cross build arm-linux (ok) | * windows cross build arm-linux (ok) | ||||
@@ -17,9 +17,19 @@ | |||||
### windows host build | ### windows host build | ||||
``` | ``` | ||||
1: installl Visual Studio (need support LLVM/clang-cl), eg 2019 | 1: installl Visual Studio (need support LLVM/clang-cl), eg 2019 | ||||
clang-cl 9 linker have crash issue, pls install 7/8/10 | |||||
pls install LLVM-10, VS llvm linker have issue, pls replace lld-link.exe, | |||||
download from https://releases.llvm.org/download.html#10.0.0 | |||||
2: install extension of VS: python/cmake/LLVM | 2: install extension of VS: python/cmake/LLVM | ||||
3: CUDA env(if enable CUDA), version detail: project_root_dir/README.md | 3: CUDA env(if enable CUDA), version detail: project_root_dir/README.md | ||||
4: now we support cuda10.1+cudnn7.6+TensorRT6.0 on windows, as windows can | |||||
only use dll in fact with cudnn/TensorRT, so please install the same version; | |||||
4a: install cuda10.1 to C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 | |||||
4b: install cudnn7.6 to C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32 | |||||
4c: install TensorRT6.0 to C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5 | |||||
4d: add C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\bin to system Path env | |||||
4e: add C:\Program Files\NVIDIA GPU Computing Toolkit\cudnn-10.1-windows10-x64-v7.6.5.32\cuda\bin to system Path env | |||||
4f: add C:\Program Files\NVIDIA GPU Computing Toolkit\TensorRT-6.0.1.5\lib Path | |||||
if u do not do 4d/4e/4f, CUDA runtime can not find dll | |||||
``` | ``` | ||||
### linux host build | ### linux host build | ||||
``` | ``` | ||||
@@ -162,8 +162,22 @@ function prepare_env_for_windows_build() { | |||||
windows_env_err | windows_env_err | ||||
fi | fi | ||||
export PATH=$VS_PATH/VC/Auxiliary/Build:$PATH | |||||
echo "put vcvarsall.bat path to PATH env.." | echo "put vcvarsall.bat path to PATH env.." | ||||
export PATH=$VS_PATH/VC/Auxiliary/Build:$PATH | |||||
echo "config cuda/cudnn/TensorRT env..." | |||||
export NIVIDA_INSTALL_PRE=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit | |||||
export CUDA_V=v10.1 | |||||
export CUDNN_V=cudnn-10.1-windows10-x64-v7.6.5.32 | |||||
export TRT_V=TensorRT-6.0.1.5 | |||||
export CUDA_PATH=$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V} | |||||
export PATH=$PATH:$CUDA_PATH/bin | |||||
export CUDA_BIN_PATH=$CUDA_PATH | |||||
export PC_CUDNN_INCLUDE_DIRS=$NIVIDA_INSTALL_PRE/${CUDNN_V}/cuda/include | |||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$NIVIDA_INSTALL_PRE/${TRT_V}/lib:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/lib/x64:$NIVIDA_INSTALL_PRE/${CUDNN_V}/cuda/lib/x64 | |||||
export CPATH=$CPATH:$NIVIDA_INSTALL_PRE/${TRT_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include:$NIVIDA_INSTALL_PRE/CUDA/${CUDA_V}/include/nvtx3:$PC_CUDNN_INCLUDE_DIRS | |||||
export LIBRARY_PATH=$LIBRARY_PATH:$LD_LIBRARY_PATH | |||||
export INCLUDE=$INCLUDE:$CPATH | |||||
} | } | ||||
WINDOWS_BUILD_TARGET="Ninja all > build.log" | WINDOWS_BUILD_TARGET="Ninja all > build.log" | ||||
@@ -1,7 +1,7 @@ | |||||
# python whl package build support status | # python whl package build support status | ||||
* windows build (ok,cpu only) | |||||
* linux build (ok, cpu or gpu) | |||||
* macos build (ok,cpu only) | |||||
* windows build (cpu + gpu) | |||||
* linux build (cpu + gpu) | |||||
* macos build (cpu only) | |||||
# build env prepare | # build env prepare | ||||
## linux | ## linux | ||||
@@ -52,8 +52,10 @@ foreach (INCPATH IN LISTS MGB_INC) | |||||
endforeach() | endforeach() | ||||
if(MGE_WITH_CUDA) | if(MGE_WITH_CUDA) | ||||
target_compile_options(megbrain PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-unused-parameter>" | |||||
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wno-unused-parameter>") | |||||
if(NOT WIN32 AND NOT MSVC) | |||||
target_compile_options(megbrain PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-unused-parameter>" | |||||
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:-Wno-unused-parameter>") | |||||
endif() | |||||
else() | else() | ||||
target_compile_options(megbrain PRIVATE "-Wno-unused-parameter") | target_compile_options(megbrain PRIVATE "-Wno-unused-parameter") | ||||
endif() | endif() | ||||
@@ -195,12 +195,14 @@ public: | |||||
static void set_flag(int f) { flag() = f; } | static void set_flag(int f) { flag() = f; } | ||||
static void init() { | static void init() { | ||||
#if !defined(WIN32) | |||||
int err = pthread_atfork(&CudaCheckOnFork::atfork_prepare, nullptr, | int err = pthread_atfork(&CudaCheckOnFork::atfork_prepare, nullptr, | ||||
nullptr); | nullptr); | ||||
if (err) { | if (err) { | ||||
mgb_throw(SystemError, "failed to setup atfork handler: %s", | mgb_throw(SystemError, "failed to setup atfork handler: %s", | ||||
strerror(err)); | strerror(err)); | ||||
} | } | ||||
#endif | |||||
} | } | ||||
}; | }; | ||||
#endif | #endif | ||||