Browse Source

fix(mgb/windows): temporary workaround on cuda-windows python exit

code(127), as the Windows CUDA driver is unloaded before atexit functions run;
this may be removed after upgrading the CUDA runtime

GitOrigin-RevId: cac37ca3dd
release-1.5
Megvii Engine Team 4 years ago
parent
commit
2d6827c168
4 changed files with 80 additions and 3 deletions
  1. +33
    -0
      imperative/src/impl/physical_tensor.cpp
  2. +7
    -0
      src/core/impl/comp_node/comp_node.cpp
  3. +33
    -3
      src/core/impl/comp_node/cuda/comp_node.cpp
  4. +7
    -0
      src/core/impl/comp_node/cuda/comp_node.h

+ 33
- 0
imperative/src/impl/physical_tensor.cpp View File

@@ -26,6 +26,13 @@ class CompNodeSyncManager : public CompNodeDepedentObject {
ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event; ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event;
std::mutex m_mtx; std::mutex m_mtx;
public: public:
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when they are registered after the CUDA driver is
//! initialized. As a temporary workaround, leave resource recovery to
//! the OS; this may need to be removed after upgrading the CUDA runtime.
static bool is_into_atexit;
#endif
std::shared_ptr<void> on_comp_node_finalize() override { std::shared_ptr<void> on_comp_node_finalize() override {
MGB_LOCK_GUARD(m_mtx); MGB_LOCK_GUARD(m_mtx);
m_blob2event.clear(); m_blob2event.clear();
@@ -34,6 +41,16 @@ public:


static CompNodeSyncManager& inst() { static CompNodeSyncManager& inst() {
static CompNodeSyncManager sl_inst; static CompNodeSyncManager sl_inst;
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when they are registered after the CUDA driver is
//! initialized. As a temporary workaround, leave resource recovery to
//! the OS; this may need to be removed after upgrading the CUDA runtime.
if (!is_into_atexit) {
auto err = atexit([] { is_into_atexit = true; });
mgb_assert(!err, "failed to register atexit function");
}
#endif
return sl_inst; return sl_inst;
} }


@@ -52,6 +69,13 @@ public:
m_blob2event.erase(blob); m_blob2event.erase(blob);
} }
}; };
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when they are registered after the CUDA driver is
//! initialized. As a temporary workaround, leave resource recovery to
//! the OS; this may need to be removed after upgrading the CUDA runtime.
bool CompNodeSyncManager::is_into_atexit = false;
#endif


// Cache for small blobs // Cache for small blobs
// 1. A blob has to be seen twice (within a window) to be eligible for cache // 1. A blob has to be seen twice (within a window) to be eligible for cache
@@ -221,6 +245,15 @@ Blob::Blob(CompNode cn, size_t sz):


Blob::~Blob() { Blob::~Blob() {
BlobManager::inst()->unregister_blob(this); BlobManager::inst()->unregister_blob(this);

#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when they are registered after the CUDA driver is
//! initialized. As a temporary workaround, leave resource recovery to
//! the OS; this may need to be removed after upgrading the CUDA runtime.
if (CompNodeSyncManager::is_into_atexit)
return;
#endif
CompNodeSyncManager::inst().remove(this); CompNodeSyncManager::inst().remove(this);
} }




+ 7
- 0
src/core/impl/comp_node/comp_node.cpp View File

@@ -556,6 +556,13 @@ CompNode CompNode::load(const Locator& locator_physical,
} }


void CompNode::finalize() { void CompNode::finalize() {
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when they are registered after the CUDA driver is
//! initialized. As a temporary workaround, leave resource recovery to
//! the OS; this may need to be removed after upgrading the CUDA runtime.
return;
#endif
comp_node_detail::DepedentObjList::invoke_callback_and_clean(); comp_node_detail::DepedentObjList::invoke_callback_and_clean();
CudaCompNode::finalize(); CudaCompNode::finalize();
CpuCompNode::finalize(); CpuCompNode::finalize();


+ 33
- 3
src/core/impl/comp_node/cuda/comp_node.cpp View File

@@ -614,6 +614,18 @@ bool CudaCompNodeImpl::check_global_finalized() {
} }
return true; return true;
} }
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when they are registered after the CUDA driver is
//! initialized. As a temporary workaround, leave resource recovery to
//! the OS; this may need to be removed after upgrading the CUDA runtime.
if (CudaCompNode::is_into_atexit) {
mgb_log_debug(
"windows cudaErrorCudartUnloading happened!!, resource "
"recovery by OS!!");
return true;
}
#endif
return false; return false;
} }


@@ -733,11 +745,29 @@ void CudaCompNode::finalize() {
} }
} }


CompNode::Impl* CudaCompNode::load_cuda(
const Locator &locator, const Locator &locator_logical) {
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when they are registered after the CUDA driver is
//! initialized. As a temporary workaround, leave resource recovery to
//! the OS; this may need to be removed after upgrading the CUDA runtime.
bool CudaCompNode::is_into_atexit = false;
#endif
CompNode::Impl* CudaCompNode::load_cuda(const Locator& locator,
const Locator& locator_logical) {
int nr_gpu = get_device_count(); int nr_gpu = get_device_count();
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when they are registered after the CUDA driver is
//! initialized. As a temporary workaround, leave resource recovery to
//! the OS; this may need to be removed after upgrading the CUDA runtime.
if (!is_into_atexit) {
auto err = atexit([] { is_into_atexit = true; });
mgb_assert(!err, "failed to register atexit function");
}
#endif
mgb_assert(locator.device >= 0 && locator.device < nr_gpu, mgb_assert(locator.device >= 0 && locator.device < nr_gpu,
"request gpu%d out of valid range [0, %d)", locator.device, nr_gpu);
"request gpu%d out of valid range [0, %d)", locator.device,
nr_gpu);


auto &&sdptr = CudaCompNodeImpl::sd; auto &&sdptr = CudaCompNodeImpl::sd;
{ {


+ 7
- 0
src/core/impl/comp_node/cuda/comp_node.h View File

@@ -36,6 +36,13 @@ namespace mgb {


static void set_prealloc_config(size_t alignment, size_t min_req, static void set_prealloc_config(size_t alignment, size_t min_req,
size_t max_overhead, double growth_factor); size_t max_overhead, double growth_factor);
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when they are registered after the CUDA driver is
//! initialized. As a temporary workaround, leave resource recovery to
//! the OS; this may need to be removed after upgrading the CUDA runtime.
static bool is_into_atexit;
#endif
}; };
} }




Loading…
Cancel
Save