GitOrigin-RevId: b0ad639921
tags/v1.3.1

@@ -40,7 +40,8 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
 option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
 option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
 option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
-option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
+option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." ON)
+option(MGE_WITH_CUBLAS_SHARED "Build MegEngine with CUBLAS shared." OFF)
 option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
 option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
 option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)

@@ -60,6 +61,11 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
 option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
+if(MSVC OR WIN32)
+    message(STATUS "windows force cudnn static link")
+    set(MGE_WITH_CUDNN_SHARED OFF)
+endif()
+
 if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
     set(MGE_WITH_ANY_CUDA_STUB ON)
 else()

@@ -472,15 +478,28 @@ if(MGE_WITH_CUDA)
     endif()
 endif()
 if(MSVC OR WIN32)
-    list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
+    list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib)
+else()
+    list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static)
+endif()
+if(MSVC OR WIN32)
+    list(APPEND MGE_CUDA_LIBS cublas.lib)
 else()
-    list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static)
+    if(MGE_WITH_CUBLAS_SHARED)
+        list(APPEND MGE_CUDA_LIBS cublas)
+    else()
+        list(APPEND MGE_CUDA_LIBS cublas_static)
+    endif()
 endif()
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
     if(MSVC OR WIN32)
         list(APPEND MGE_CUDA_LIBS cublasLt.lib)
     else()
-        list(APPEND MGE_CUDA_LIBS cublasLt_static)
+        if(MGE_WITH_CUBLAS_SHARED)
+            list(APPEND MGE_CUDA_LIBS cublasLt)
+        else()
+            list(APPEND MGE_CUDA_LIBS cublasLt_static)
+        endif()
     endif()
 endif()
 if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32)

@@ -55,6 +55,12 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle):
     megdnn_assert(cublasLtGetVersion() >= 10010,
                   "cuda library version is too low to run cublasLt");
 #endif
+#if CUDNN_VERSION >= 8000
+    megdnn_log_warn(R"(
+cuDNN 8 JIT-compiles PTX code and caches the result. Set the CUDA_CACHE_MAXSIZE and
+CUDA_CACHE_PATH environment variables to avoid repeated (very slow) JIT compilation.
+For example: `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
+#endif
     cudnn_check(cudnnCreate(&m_cudnn_handle));
     cublas_check(cublasCreate(&m_cublas_handle));
 #if CUDA_VERSION >= 10010
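
Note: the environment variables named in the new warning are the CUDA driver's
JIT-cache controls. As a minimal sketch (assuming a Python launcher; the cache
path is only an example), they can be applied before any CUDA context exists:

    import os

    # CUDA driver JIT-cache controls; must be set before the first CUDA
    # context is created in this process.
    os.environ.setdefault("CUDA_CACHE_MAXSIZE", "2147483647")  # 2**31 - 1 bytes
    os.environ.setdefault("CUDA_CACHE_PATH", "/data/.cuda_cache")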

@@ -199,4 +199,4 @@ def test_dp_correctness():
     model_name = "mnist_model_with_test.mge"
     model_path = os.path.join(os.path.dirname(__file__), model_name)
     set_execution_strategy("HEURISTIC_REPRODUCIBLE")
-    run_test(model_path, False, False, max_err=1e-5)
+    run_test(model_path, False, False, max_err=5e-5)

@@ -22,7 +22,7 @@ from megengine.utils.comp_graph_tools import GraphInference
 from megengine.utils.network import Network as Net


-def check_pygraph_dump(trace_func, inp_data, expect_results):
+def check_pygraph_dump(trace_func, inp_data, expect_results, max_err=None):
     orig_model = io.BytesIO()
     inp_size = len(inp_data)
     out_size = len(expect_results)

@@ -46,7 +46,12 @@ def check_pygraph_dump(trace_func, inp_data, expect_results):
     results = graph.run(inp_dict=inp_dict)
     for ind, tensor in enumerate(expect_results):
-        np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
+        if max_err:
+            np.testing.assert_almost_equal(
+                tensor.numpy(), results[output_names[ind]], max_err
+            )
+        else:
+            np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
         assert tensor.dtype == results[output_names[ind]].dtype
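
Note: the third positional argument of np.testing.assert_almost_equal is
`decimal` (a count of decimal places), not an absolute error bound: the check
is abs(desired - actual) < 1.5 * 10**(-decimal). So the `max_err` passed here
is a precision, and max_err=5 tolerates differences up to 1.5e-5. A
self-contained illustration:

    import numpy as np

    # decimal=5 accepts absolute differences below 1.5e-5
    np.testing.assert_almost_equal(1.000001, 1.0, 5)   # passes: 1e-6 < 1.5e-5
    try:
        np.testing.assert_almost_equal(1.001, 1.0, 5)  # raises: 1e-3 >= 1.5e-5
    except AssertionError:
        pass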

@@ -178,7 +183,8 @@ def test_convtranspose():
     data = Tensor(np.random.random((1, 32, 32, 32)))
     result = fwd(data)
-    check_pygraph_dump(fwd, [data], [result])
+    # cu111 has 1e-7 diff
+    check_pygraph_dump(fwd, [data], [result], 5)


 @pytest.mark.skip(reason="pytest aborted")

@@ -31,7 +31,7 @@ echo "Build with ${SDK_NAME}"
 if [ $SDK_NAME == "cu101" ];then
     CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF"
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
     BUILD_GCC8="ON"
     REQUIR_CUDA_VERSION="10010"
     REQUIR_CUDNN_VERSION="7.6.3"

@@ -49,7 +49,7 @@ elif [ $SDK_NAME == "cu111" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
     -gencode arch=compute_61,code=sm_61 \
              arch=compute_70,code=sm_70 \
              arch=compute_75,code=sm_75 \

@@ -72,7 +72,7 @@ elif [ $SDK_NAME == "cu112" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
     -gencode arch=compute_61,code=sm_61 \
              arch=compute_70,code=sm_70 \
              arch=compute_75,code=sm_75 \

@@ -214,6 +214,8 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
             mgb_assert(
                     m_property.mem_alignment ==
                     MegDNNHandle::get(*this).handle()->alignment_requirement());
+            auto err = atexit(&CompNode::finalize);
+            mgb_assert(!err, "failed to register CompNode::finalize at exit");
         }
         MGB_CATCH(std::exception & exc, {
             mgb_log_error("async cuda init failed: %s", exc.what());
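
Note: atexit handlers run at normal process exit in reverse order of
registration, so registering CompNode::finalize only after device
initialization succeeds gives a controlled teardown of compute nodes at exit.
For illustration only (the commit registers the C++ CompNode::finalize through
C atexit), the same pattern in Python:

    import atexit

    def finalize():
        # stand-in for CompNode::finalize: release device resources
        print("releasing device resources")

    # runs automatically at normal interpreter exit, last registered first
    atexit.register(finalize)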

@@ -304,6 +306,8 @@ void CompNodeEnv::init_rocm_async(int dev, CompNode comp_node,
             mgb_assert(
                     m_property.mem_alignment ==
                     MegDNNHandle::get(*this).handle()->alignment_requirement());
+            auto err = atexit(&CompNode::finalize);
+            mgb_assert(!err, "failed to register CompNode::finalize at exit");
         }
         MGB_CATCH(std::exception & exc, {
             mgb_log_error("async rocm init failed: %s", exc.what());

@@ -1850,8 +1850,6 @@ TEST(TestEnableTensorCore, SmallInputShape) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }

-//! close for cu111 ci, reopen it when bug fixed
-#if CUDA_VERSION < 11000
 TEST(TestEnableTensorCore, Nchw4Nchw) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");

@@ -1957,7 +1955,6 @@ TEST(TestEnableTensorCore, Nchw4Nchw) {
         MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
     }
 }
-#endif

 TEST(TestEnableTensorCore, ConvBiasWithZ) {
     REQUIRE_GPU(1);