code(127), because the Windows CUDA driver unloads before atexit functions run;
this workaround may be removed after upgrading the CUDA runtime
GitOrigin-RevId: cac37ca3dd
release-1.5
@@ -26,6 +26,13 @@ class CompNodeSyncManager : public CompNodeDepedentObject { | |||||
ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event; | ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event; | ||||
std::mutex m_mtx; | std::mutex m_mtx; | ||||
public: | public: | ||||
#if MGB_CUDA && defined(WIN32) | |||||
//! FIXME: on Windows the CUDA driver shuts down before atexit handlers | |||||
//! run, even when the handler is registered after the CUDA driver is | |||||
//! initialized. As a workaround, resources are left for the OS to reclaim | |||||
//! for now; this may be removed after upgrading the CUDA runtime. | |||||
static bool is_into_atexit; | |||||
#endif | |||||
std::shared_ptr<void> on_comp_node_finalize() override { | std::shared_ptr<void> on_comp_node_finalize() override { | ||||
MGB_LOCK_GUARD(m_mtx); | MGB_LOCK_GUARD(m_mtx); | ||||
m_blob2event.clear(); | m_blob2event.clear(); | ||||
@@ -34,6 +41,16 @@ public: | |||||
static CompNodeSyncManager& inst() { | static CompNodeSyncManager& inst() { | ||||
static CompNodeSyncManager sl_inst; | static CompNodeSyncManager sl_inst; | ||||
#if MGB_CUDA && defined(WIN32) | |||||
//! FIXME: on Windows the CUDA driver shuts down before atexit handlers | |||||
//! run, even when the handler is registered after the CUDA driver is | |||||
//! initialized. As a workaround, resources are left for the OS to reclaim | |||||
//! for now; this may be removed after upgrading the CUDA runtime. | |||||
if (!is_into_atexit) { | |||||
auto err = atexit([] { is_into_atexit = true; }); | |||||
mgb_assert(!err, "failed to register atexit function"); | |||||
} | |||||
#endif | |||||
return sl_inst; | return sl_inst; | ||||
} | } | ||||
@@ -52,6 +69,13 @@ public: | |||||
m_blob2event.erase(blob); | m_blob2event.erase(blob); | ||||
} | } | ||||
}; | }; | ||||
#if MGB_CUDA && defined(WIN32) | |||||
//! FIXME: on Windows the CUDA driver shuts down before atexit handlers | |||||
//! run, even when the handler is registered after the CUDA driver is | |||||
//! initialized. As a workaround, resources are left for the OS to reclaim | |||||
//! for now; this may be removed after upgrading the CUDA runtime. | |||||
bool CompNodeSyncManager::is_into_atexit = false; | |||||
#endif | |||||
// Cache for small blobs | // Cache for small blobs | ||||
// 1. A blob has to be seen twice (within a window) to be eligible for cache | // 1. A blob has to be seen twice (within a window) to be eligible for cache | ||||
@@ -221,6 +245,15 @@ Blob::Blob(CompNode cn, size_t sz): | |||||
Blob::~Blob() { | Blob::~Blob() { | ||||
BlobManager::inst()->unregister_blob(this); | BlobManager::inst()->unregister_blob(this); | ||||
#if MGB_CUDA && defined(WIN32) | |||||
//! FIXME: on Windows the CUDA driver shuts down before atexit handlers | |||||
//! run, even when the handler is registered after the CUDA driver is | |||||
//! initialized. As a workaround, resources are left for the OS to reclaim | |||||
//! for now; this may be removed after upgrading the CUDA runtime. | |||||
if (CompNodeSyncManager::is_into_atexit) | |||||
return; | |||||
#endif | |||||
CompNodeSyncManager::inst().remove(this); | CompNodeSyncManager::inst().remove(this); | ||||
} | } | ||||
@@ -556,6 +556,13 @@ CompNode CompNode::load(const Locator& locator_physical, | |||||
} | } | ||||
void CompNode::finalize() { | void CompNode::finalize() { | ||||
#if MGB_CUDA && defined(WIN32) | |||||
//! FIXME: on Windows the CUDA driver shuts down before atexit handlers | |||||
//! run, even when the handler is registered after the CUDA driver is | |||||
//! initialized. As a workaround, resources are left for the OS to reclaim | |||||
//! for now; this may be removed after upgrading the CUDA runtime. | |||||
return; | |||||
#endif | |||||
comp_node_detail::DepedentObjList::invoke_callback_and_clean(); | comp_node_detail::DepedentObjList::invoke_callback_and_clean(); | ||||
CudaCompNode::finalize(); | CudaCompNode::finalize(); | ||||
CpuCompNode::finalize(); | CpuCompNode::finalize(); | ||||
@@ -614,6 +614,18 @@ bool CudaCompNodeImpl::check_global_finalized() { | |||||
} | } | ||||
return true; | return true; | ||||
} | } | ||||
#if MGB_CUDA && defined(WIN32) | |||||
//! FIXME: on Windows the CUDA driver shuts down before atexit handlers | |||||
//! run, even when the handler is registered after the CUDA driver is | |||||
//! initialized. As a workaround, resources are left for the OS to reclaim | |||||
//! for now; this may be removed after upgrading the CUDA runtime. | |||||
if (CudaCompNode::is_into_atexit) { | |||||
mgb_log_debug( | |||||
"windows cudaErrorCudartUnloading happened!!, resource " | |||||
"recovery by OS!!"); | |||||
return true; | |||||
} | |||||
#endif | |||||
return false; | return false; | ||||
} | } | ||||
@@ -733,11 +745,29 @@ void CudaCompNode::finalize() { | |||||
} | } | ||||
} | } | ||||
CompNode::Impl* CudaCompNode::load_cuda( | |||||
const Locator &locator, const Locator &locator_logical) { | |||||
#if MGB_CUDA && defined(WIN32) | |||||
//! FIXME: on Windows the CUDA driver shuts down before atexit handlers | |||||
//! run, even when the handler is registered after the CUDA driver is | |||||
//! initialized. As a workaround, resources are left for the OS to reclaim | |||||
//! for now; this may be removed after upgrading the CUDA runtime. | |||||
bool CudaCompNode::is_into_atexit = false; | |||||
#endif | |||||
CompNode::Impl* CudaCompNode::load_cuda(const Locator& locator, | |||||
const Locator& locator_logical) { | |||||
int nr_gpu = get_device_count(); | int nr_gpu = get_device_count(); | ||||
#if MGB_CUDA && defined(WIN32) | |||||
//! FIXME: on Windows the CUDA driver shuts down before atexit handlers | |||||
//! run, even when the handler is registered after the CUDA driver is | |||||
//! initialized. As a workaround, resources are left for the OS to reclaim | |||||
//! for now; this may be removed after upgrading the CUDA runtime. | |||||
if (!is_into_atexit) { | |||||
auto err = atexit([] { is_into_atexit = true; }); | |||||
mgb_assert(!err, "failed to register atexit function"); | |||||
} | |||||
#endif | |||||
mgb_assert(locator.device >= 0 && locator.device < nr_gpu, | mgb_assert(locator.device >= 0 && locator.device < nr_gpu, | ||||
"request gpu%d out of valid range [0, %d)", locator.device, nr_gpu); | |||||
"request gpu%d out of valid range [0, %d)", locator.device, | |||||
nr_gpu); | |||||
auto &&sdptr = CudaCompNodeImpl::sd; | auto &&sdptr = CudaCompNodeImpl::sd; | ||||
{ | { | ||||
@@ -36,6 +36,13 @@ namespace mgb { | |||||
static void set_prealloc_config(size_t alignment, size_t min_req, | static void set_prealloc_config(size_t alignment, size_t min_req, | ||||
size_t max_overhead, double growth_factor); | size_t max_overhead, double growth_factor); | ||||
#if MGB_CUDA && defined(WIN32) | |||||
//! FIXME: on Windows the CUDA driver shuts down before atexit handlers | |||||
//! run, even when the handler is registered after the CUDA driver is | |||||
//! initialized. As a workaround, resources are left for the OS to reclaim | |||||
//! for now; this may be removed after upgrading the CUDA runtime. | |||||
static bool is_into_atexit; | |||||
#endif | |||||
}; | }; | ||||
} | } | ||||