From 2d6827c168077f81578b2d7b74293504b98f3afc Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 1 Jun 2021 17:05:55 +0800 Subject: [PATCH] fix(mgb/windows): temporary workround on cuda-windows python exit code(127), as windows cuda driver unloading before atexit function may remove this after upgrade cuda runtime GitOrigin-RevId: cac37ca3ddc569e2a82185c6744da5c676042cc3 --- imperative/src/impl/physical_tensor.cpp | 33 +++++++++++++++++++++++++++ src/core/impl/comp_node/comp_node.cpp | 7 ++++++ src/core/impl/comp_node/cuda/comp_node.cpp | 36 +++++++++++++++++++++++++++--- src/core/impl/comp_node/cuda/comp_node.h | 7 ++++++ 4 files changed, 80 insertions(+), 3 deletions(-) diff --git a/imperative/src/impl/physical_tensor.cpp b/imperative/src/impl/physical_tensor.cpp index 5903b3dc..4261c4f0 100644 --- a/imperative/src/impl/physical_tensor.cpp +++ b/imperative/src/impl/physical_tensor.cpp @@ -26,6 +26,13 @@ class CompNodeSyncManager : public CompNodeDepedentObject { ThinHashMap> m_blob2event; std::mutex m_mtx; public: +#if MGB_CUDA && defined(WIN32) + //! FIXME: the Windows CUDA driver shuts down before atexit functions run, + //! even when they are registered after CUDA driver init. As a workaround, + //! resources are left for the OS to reclaim for now; this may need to be + //! removed after upgrading the CUDA runtime + static bool is_into_atexit; +#endif std::shared_ptr on_comp_node_finalize() override { MGB_LOCK_GUARD(m_mtx); m_blob2event.clear(); @@ -34,6 +41,16 @@ public: static CompNodeSyncManager& inst() { static CompNodeSyncManager sl_inst; +#if MGB_CUDA && defined(WIN32) + //! FIXME: the Windows CUDA driver shuts down before atexit functions run, + //! even when they are registered after CUDA driver init. As a workaround, + //! resources are left for the OS to reclaim for now; remove this once we + //! 
upgrade cuda runtime + if (!is_into_atexit) { + auto err = atexit([] { is_into_atexit = true; }); + mgb_assert(!err, "failed to register atexit function"); + } +#endif return sl_inst; } @@ -52,6 +69,13 @@ public: m_blob2event.erase(blob); } }; +#if MGB_CUDA && defined(WIN32) +//! FIXME: the Windows CUDA driver shuts down before atexit functions run, even +//! when registered after driver init; as a workaround the OS reclaims resources +//! for now. May need removal after a CUDA runtime upgrade. NOTE(review): inst() +//! calls atexit() on every call while this flag is false -- confirm intended. +bool CompNodeSyncManager::is_into_atexit = false; +#endif // Cache for small blobs // 1. A blob has to be seen twice (within a window) to be eligible for cache @@ -221,6 +245,15 @@ Blob::Blob(CompNode cn, size_t sz): Blob::~Blob() { BlobManager::inst()->unregister_blob(this); + +#if MGB_CUDA && defined(WIN32) + //! FIXME: the Windows CUDA driver shuts down before atexit functions run, even + //! when registered after driver init. As a workaround, once the atexit hook has + //! set is_into_atexit we skip the sync-manager removal and let the OS reclaim + //! resources; this may need to be removed after upgrading the CUDA runtime + if (CompNodeSyncManager::is_into_atexit) + return; +#endif CompNodeSyncManager::inst().remove(this); } diff --git a/src/core/impl/comp_node/comp_node.cpp b/src/core/impl/comp_node/comp_node.cpp index a9a17c41..d4bad933 100644 --- a/src/core/impl/comp_node/comp_node.cpp +++ b/src/core/impl/comp_node/comp_node.cpp @@ -556,6 +556,13 @@ CompNode CompNode::load(const Locator& locator_physical, } void CompNode::finalize() { +#if MGB_CUDA && defined(WIN32) + //! FIXME: the Windows CUDA driver shuts down before atexit functions run, even + //! when registered after driver init. As a workaround, finalize() returns early + //! here and the OS reclaims resources; may need removal after upgrading CUDA + //! 
runtime + return; +#endif comp_node_detail::DepedentObjList::invoke_callback_and_clean(); CudaCompNode::finalize(); CpuCompNode::finalize(); diff --git a/src/core/impl/comp_node/cuda/comp_node.cpp b/src/core/impl/comp_node/cuda/comp_node.cpp index 73e8dfb4..64279cc4 100644 --- a/src/core/impl/comp_node/cuda/comp_node.cpp +++ b/src/core/impl/comp_node/cuda/comp_node.cpp @@ -614,6 +614,18 @@ bool CudaCompNodeImpl::check_global_finalized() { } return true; } +#if MGB_CUDA && defined(WIN32) + //! FIXME: the Windows CUDA driver shuts down before atexit functions run, even + //! when registered after driver init. As a workaround, return true once the + //! atexit hook has set is_into_atexit and let the OS reclaim resources; this + //! may need to be removed after upgrading the CUDA runtime + if (CudaCompNode::is_into_atexit) { + mgb_log_debug( + "windows cudaErrorCudartUnloading happened!!, resource " + "recovery by OS!!"); + return true; + } +#endif return false; } @@ -733,11 +745,29 @@ void CudaCompNode::finalize() { } } -CompNode::Impl* CudaCompNode::load_cuda( - const Locator &locator, const Locator &locator_logical) { +#if MGB_CUDA && defined(WIN32) +//! FIXME: the Windows CUDA driver shuts down before atexit functions run, even +//! when registered after driver init; as a workaround the OS reclaims resources +//! for now. May need removal after a CUDA runtime upgrade. NOTE(review): load_cuda() +//! calls atexit() on every call while this flag is false -- confirm intended. +bool CudaCompNode::is_into_atexit = false; +#endif +CompNode::Impl* CudaCompNode::load_cuda(const Locator& locator, + const Locator& locator_logical) { int nr_gpu = get_device_count(); +#if MGB_CUDA && defined(WIN32) + //! FIXME: the Windows CUDA driver shuts down before atexit functions run, even + //! when registered after driver init. As a workaround, flag process exit via + //! an atexit hook and let the OS reclaim resources; remove this once we + //! 
upgrade cuda runtime + if (!is_into_atexit) { + auto err = atexit([] { is_into_atexit = true; }); + mgb_assert(!err, "failed to register atexit function"); + } +#endif mgb_assert(locator.device >= 0 && locator.device < nr_gpu, - "request gpu%d out of valid range [0, %d)", locator.device, nr_gpu); + "request gpu%d out of valid range [0, %d)", locator.device, + nr_gpu); auto &&sdptr = CudaCompNodeImpl::sd; { diff --git a/src/core/impl/comp_node/cuda/comp_node.h b/src/core/impl/comp_node/cuda/comp_node.h index 6fe772fb..83e662d7 100644 --- a/src/core/impl/comp_node/cuda/comp_node.h +++ b/src/core/impl/comp_node/cuda/comp_node.h @@ -36,6 +36,13 @@ namespace mgb { static void set_prealloc_config(size_t alignment, size_t min_req, size_t max_overhead, double growth_factor); +#if MGB_CUDA && defined(WIN32) + //! FIXME: the Windows CUDA driver shuts down before atexit functions + //! run, even when they are registered after CUDA driver init. As a + //! workaround, resources are left for the OS to reclaim for now; this + //! may need to be removed after upgrading the CUDA runtime + static bool is_into_atexit; +#endif }; }