GitOrigin-RevId: b0ad639921
tags/v1.3.1
@@ -40,7 +40,8 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." ON)
option(MGE_WITH_CUBLAS_SHARED "Build MegEngine with CUBLAS shared." OFF)
option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
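The new MGE_WITH_CUBLAS_SHARED switch complements MGE_WITH_CUDNN_SHARED, whose default flips to ON above. A minimal configure sketch, assuming an out-of-tree build directory and a working CUDA toolchain; every flag not shown is left at its default:

# Hedged example: build with CUDA, linking cuDNN and cuBLAS as shared libraries,
# mirroring what the cu111/cu112 wheel scripts below now pass.
mkdir -p build && cd build
cmake .. -DMGE_WITH_CUDA=ON -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON
cmake --build . --parallel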
@@ -60,6 +61,11 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
if(MSVC OR WIN32)
message(STATUS "windows force cudnn static link")
set(MGE_WITH_CUDNN_SHARED OFF)
endif()
if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
set(MGE_WITH_ANY_CUDA_STUB ON)
else()
@@ -472,15 +478,28 @@ if(MGE_WITH_CUDA)
endif()
endif()
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib)
else()
list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static)
endif()
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cublas.lib)
else()
list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static)
if(MGE_WITH_CUBLAS_SHARED)
list(APPEND MGE_CUDA_LIBS cublas)
else()
list(APPEND MGE_CUDA_LIBS cublas_static)
endif()
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cublasLt.lib)
else()
list(APPEND MGE_CUDA_LIBS cublasLt_static)
if(MGE_WITH_CUBLAS_SHARED)
list(APPEND MGE_CUDA_LIBS cublasLt)
else()
list(APPEND MGE_CUDA_LIBS cublasLt_static)
endif()
endif()
endif()
if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32)
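Since the only change in these branches is whether the shared targets (cublas, cublasLt) or the static ones (cublas_static, cublasLt_static) are appended to MGE_CUDA_LIBS, a quick way to check what a finished build actually linked is to inspect its dynamic dependencies. A rough sketch; the library path below is an assumption and depends on your build layout:

# A statically linked build should show no libcublas entries here.
ldd path/to/libmegengine.so | grep -i cublas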
@@ -55,6 +55,12 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle):
megdnn_assert(cublasLtGetVersion() >= 10010,
"cuda library version is too low to run cublasLt");
#endif
#if CUDNN_VERSION >= 8000
megdnn_log_warn(R"(
cuDNN 8 JIT-compiles PTX code and caches the result. You can set the
CUDA_CACHE_MAXSIZE and CUDA_CACHE_PATH environment variables to avoid repeated JIT compilation (very slow).
For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
#endif
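The warning text already gives the key commands; for completeness, a usage sketch of the two environment variables (the cache directory is only an example path):

# Enlarge the CUDA JIT cache and put it somewhere persistent so cuDNN 8's
# PTX JIT results are reused across runs instead of being recompiled.
export CUDA_CACHE_MAXSIZE=2147483647
export CUDA_CACHE_PATH=/data/.cuda_cache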
cudnn_check(cudnnCreate(&m_cudnn_handle));
cublas_check(cublasCreate(&m_cublas_handle));
#if CUDA_VERSION >= 10010
@@ -199,4 +199,4 @@ def test_dp_correctness():
model_name = "mnist_model_with_test.mge"
model_path = os.path.join(os.path.dirname(__file__), model_name)
set_execution_strategy("HEURISTIC_REPRODUCIBLE")
run_test(model_path, False, False, max_err=1e-5)
run_test(model_path, False, False, max_err=5e-5)
@@ -22,7 +22,7 @@ from megengine.utils.comp_graph_tools import GraphInference
from megengine.utils.network import Network as Net
def check_pygraph_dump(trace_func, inp_data, expect_results):
def check_pygraph_dump(trace_func, inp_data, expect_results, max_err=None):
orig_model = io.BytesIO()
inp_size = len(inp_data)
out_size = len(expect_results)
@@ -46,7 +46,12 @@ def check_pygraph_dump(trace_func, inp_data, expect_results):
results = graph.run(inp_dict=inp_dict)
for ind, tensor in enumerate(expect_results):
np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
if max_err:
np.testing.assert_almost_equal(
tensor.numpy(), results[output_names[ind]], max_err
)
else:
np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
assert tensor.dtype == results[output_names[ind]].dtype
@@ -178,7 +183,8 @@ def test_convtranspose():
data = Tensor(np.random.random((1, 32, 32, 32)))
result = fwd(data)
check_pygraph_dump(fwd, [data], [result])
# cu111 has a ~1e-7 numerical diff; 5 is the decimal-places tolerance forwarded to assert_almost_equal
check_pygraph_dump(fwd, [data], [result], 5)
@pytest.mark.skip(reason="pytest aborted")
@@ -31,7 +31,7 @@ echo "Build with ${SDK_NAME}"
if [ $SDK_NAME == "cu101" ];then
CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
BUILD_GCC8="ON"
REQUIR_CUDA_VERSION="10010"
REQUIR_CUDNN_VERSION="7.6.3"
@@ -49,7 +49,7 @@ elif [ $SDK_NAME == "cu111" ];then
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn.so.8"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
-gencode arch=compute_61,code=sm_61 \
arch=compute_70,code=sm_70 \
arch=compute_75,code=sm_75 \
@@ -72,7 +72,7 @@ elif [ $SDK_NAME == "cu112" ];then
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn.so.8"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
-gencode arch=compute_61,code=sm_61 \
arch=compute_70,code=sm_70 \
arch=compute_75,code=sm_75 \
@@ -214,6 +214,8 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
mgb_assert(
m_property.mem_alignment ==
MegDNNHandle::get(*this).handle()->alignment_requirement());
auto err = atexit(&CompNode::finalize);
mgb_assert(!err, "failed to register CompNode::finalize at exit");
}
MGB_CATCH(std::exception & exc, {
mgb_log_error("async cuda init failed: %s", exc.what());
@@ -304,6 +306,8 @@ void CompNodeEnv::init_rocm_async(int dev, CompNode comp_node,
mgb_assert(
m_property.mem_alignment ==
MegDNNHandle::get(*this).handle()->alignment_requirement());
auto err = atexit(&CompNode::finalize);
mgb_assert(!err, "failed to register CompNode::finalize at exit");
}
MGB_CATCH(std::exception & exc, {
mgb_log_error("async rocm init failed: %s", exc.what());
@@ -1850,8 +1850,6 @@ TEST(TestEnableTensorCore, SmallInputShape) {
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
//! close for cu111 ci, reopen it when bug fixed
#if CUDA_VERSION < 11000
TEST(TestEnableTensorCore, Nchw4Nchw) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
@@ -1957,7 +1955,6 @@ TEST(TestEnableTensorCore, Nchw4Nchw) {
MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
}
}
#endif
TEST(TestEnableTensorCore, ConvBiasWithZ) {
REQUIRE_GPU(1);