
fix(dnn): fix cudnn crash when finalize called after cudnn dtor

GitOrigin-RevId: b0ad639921
tags/v1.3.1
Megvii Engine Team, 4 years ago
parent commit 04b1a45af4
7 changed files with 46 additions and 14 deletions

  1. CMakeLists.txt  (+23, -4)
  2. dnn/src/cuda/handle.cpp  (+6, -0)
  3. imperative/python/test/integration/test_dp_correctness.py  (+1, -1)
  4. imperative/python/test/unit/utils/test_network_node.py  (+9, -3)
  5. scripts/whl/manylinux2014/build_wheel_common.sh  (+3, -3)
  6. src/core/impl/comp_node_env.cpp  (+4, -0)
  7. src/gopt/test/inference.cpp  (+0, -3)

CMakeLists.txt  (+23, -4)

@@ -40,7 +40,8 @@ option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
 option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
 option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
 option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
-option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
+option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." ON)
+option(MGE_WITH_CUBLAS_SHARED "Build MegEngine with CUBLAS shared." OFF)
 option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
 option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
 option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
@@ -60,6 +61,11 @@ option(MGE_WITH_ROCM "Enable ROCM support" OFF)
 option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)


+if(MSVC OR WIN32)
+    message(STATUS "windows force cudnn static link")
+    set(MGE_WITH_CUDNN_SHARED OFF)
+endif()
+
 if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
     set(MGE_WITH_ANY_CUDA_STUB ON)
 else()
@@ -472,15 +478,28 @@ if(MGE_WITH_CUDA)
         endif()
     endif()
     if(MSVC OR WIN32)
-        list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
+        list(APPEND MGE_CUDA_LIBS cusolver.lib curand.lib cudart_static.lib cusparse.lib)
     else()
-        list(APPEND MGE_CUDA_LIBS cusolver_static cublas_static curand_static culibos cudart_static cusparse_static)
+        list(APPEND MGE_CUDA_LIBS cusolver_static curand_static culibos cudart_static cusparse_static)
     endif()
+    if(MSVC OR WIN32)
+        list(APPEND MGE_CUDA_LIBS cublas.lib)
+    else()
+        if(MGE_WITH_CUBLAS_SHARED)
+            list(APPEND MGE_CUDA_LIBS cublas)
+        else()
+            list(APPEND MGE_CUDA_LIBS cublas_static)
+        endif()
+    endif()
     if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
         if(MSVC OR WIN32)
             list(APPEND MGE_CUDA_LIBS cublasLt.lib)
         else()
-            list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            if(MGE_WITH_CUBLAS_SHARED)
+                list(APPEND MGE_CUDA_LIBS cublasLt)
+            else()
+                list(APPEND MGE_CUDA_LIBS cublasLt_static)
+            endif()
         endif()
     endif()
     if((${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0") AND NOT MSVC AND NOT WIN32)


dnn/src/cuda/handle.cpp  (+6, -0)

@@ -55,6 +55,12 @@ HandleImpl::HandleImpl(megcoreComputingHandle_t comp_handle):
     megdnn_assert(cublasLtGetVersion() >= 10010,
             "cuda library version is too low to run cublasLt");
 #endif
+#if CUDNN_VERSION >= 8000
+    megdnn_log_warn(R"(
+Cudnn8 will jit ptx code with cache. You can set
+CUDA_CACHE_MAXSIZE and CUDA_CACHE_PATH environment var to avoid repeat jit(very slow).
+For example `export CUDA_CACHE_MAXSIZE=2147483647` and `export CUDA_CACHE_PATH=/data/.cuda_cache`)");
+#endif
     cudnn_check(cudnnCreate(&m_cudnn_handle));
     cublas_check(cublasCreate(&m_cublas_handle));
 #if CUDA_VERSION >= 10010
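The new warning only tells users to export CUDA_CACHE_MAXSIZE / CUDA_CACHE_PATH by hand. As a rough sketch of the same idea outside this patch, a host program could pre-seed those variables itself before the first CUDA call; the helper name and default values below are hypothetical (not MegEngine code), and setenv is POSIX-only:

// Hypothetical helper, not part of this commit: pre-seed the CUDA JIT cache
// variables named in the warning above, without clobbering user settings.
#include <cstdlib>

static void seed_cuda_jit_cache_defaults() {
    // Last argument 0 means: keep any value the user already exported.
    setenv("CUDA_CACHE_MAXSIZE", "2147483647", 0);      // large cache limit, as in the warning text
    setenv("CUDA_CACHE_PATH", "/data/.cuda_cache", 0);  // example cache directory
}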


imperative/python/test/integration/test_dp_correctness.py  (+1, -1)

@@ -199,4 +199,4 @@ def test_dp_correctness():
     model_name = "mnist_model_with_test.mge"
     model_path = os.path.join(os.path.dirname(__file__), model_name)
     set_execution_strategy("HEURISTIC_REPRODUCIBLE")
-    run_test(model_path, False, False, max_err=1e-5)
+    run_test(model_path, False, False, max_err=5e-5)

imperative/python/test/unit/utils/test_network_node.py  (+9, -3)

@@ -22,7 +22,7 @@ from megengine.utils.comp_graph_tools import GraphInference
 from megengine.utils.network import Network as Net


-def check_pygraph_dump(trace_func, inp_data, expect_results):
+def check_pygraph_dump(trace_func, inp_data, expect_results, max_err=None):
     orig_model = io.BytesIO()
     inp_size = len(inp_data)
     out_size = len(expect_results)
@@ -46,7 +46,12 @@ def check_pygraph_dump(trace_func, inp_data, expect_results):
     results = graph.run(inp_dict=inp_dict)

     for ind, tensor in enumerate(expect_results):
-        np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
+        if max_err:
+            np.testing.assert_almost_equal(
+                tensor.numpy(), results[output_names[ind]], max_err
+            )
+        else:
+            np.testing.assert_equal(tensor.numpy(), results[output_names[ind]])
         assert tensor.dtype == results[output_names[ind]].dtype


@@ -178,7 +183,8 @@ def test_convtranspose():

     data = Tensor(np.random.random((1, 32, 32, 32)))
     result = fwd(data)
-    check_pygraph_dump(fwd, [data], [result])
+    # cu111 has 1e-7 diff
+    check_pygraph_dump(fwd, [data], [result], 5)


 @pytest.mark.skip(reason="pytest aborted")


scripts/whl/manylinux2014/build_wheel_common.sh  (+3, -3)

@@ -31,7 +31,7 @@ echo "Build with ${SDK_NAME}"

 if [ $SDK_NAME == "cu101" ];then
     CUDA_COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF"
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF -DMGE_WITH_CUBLAS_SHARED=OFF"
     BUILD_GCC8="ON"
     REQUIR_CUDA_VERSION="10010"
     REQUIR_CUDNN_VERSION="7.6.3"
@@ -49,7 +49,7 @@ elif [ $SDK_NAME == "cu111" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
 -gencode arch=compute_61,code=sm_61 \
 arch=compute_70,code=sm_70 \
 arch=compute_75,code=sm_75 \
@@ -72,7 +72,7 @@ elif [ $SDK_NAME == "cu112" ];then
 ${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
 ${CUDNN_LIB_DIR}/libcudnn.so.8"
-    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \
+    EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON -DMGE_WITH_CUBLAS_SHARED=ON \
 -gencode arch=compute_61,code=sm_61 \
 arch=compute_70,code=sm_70 \
 arch=compute_75,code=sm_75 \


src/core/impl/comp_node_env.cpp  (+4, -0)

@@ -214,6 +214,8 @@ void CompNodeEnv::init_cuda_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async cuda init failed: %s", exc.what());
@@ -304,6 +306,8 @@ void CompNodeEnv::init_rocm_async(int dev, CompNode comp_node,
         mgb_assert(
                 m_property.mem_alignment ==
                 MegDNNHandle::get(*this).handle()->alignment_requirement());
+        auto err = atexit(&CompNode::finalize);
+        mgb_assert(!err, "failed to register CompNode::finalize at exit");
     }
     MGB_CATCH(std::exception & exc, {
         mgb_log_error("async rocm init failed: %s", exc.what());


src/gopt/test/inference.cpp  (+0, -3)

@@ -1850,8 +1850,6 @@ TEST(TestEnableTensorCore, SmallInputShape) {
     MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
 }

-//! close for cu111 ci, reopen it when bug fixed
-#if CUDA_VERSION < 11000
 TEST(TestEnableTensorCore, Nchw4Nchw) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
@@ -1957,7 +1955,6 @@ TEST(TestEnableTensorCore, Nchw4Nchw) {
         MGB_ASSERT_TENSOR_EQ(host_y, host_y_opt);
     }
 }
-#endif

 TEST(TestEnableTensorCore, ConvBiasWithZ) {
     REQUIRE_GPU(1);

