Browse Source

fix(mgb/windows): temporary workaround on cuda-windows python exit

code(127), as the Windows CUDA driver unloads before atexit functions run;
this may be removed after upgrading the CUDA runtime

GitOrigin-RevId: cac37ca3dd
release-1.5
Megvii Engine Team 4 years ago
parent
commit
2d6827c168
4 changed files with 80 additions and 3 deletions
  1. +33
    -0
      imperative/src/impl/physical_tensor.cpp
  2. +7
    -0
      src/core/impl/comp_node/comp_node.cpp
  3. +33
    -3
      src/core/impl/comp_node/cuda/comp_node.cpp
  4. +7
    -0
      src/core/impl/comp_node/cuda/comp_node.h

+ 33
- 0
imperative/src/impl/physical_tensor.cpp View File

@@ -26,6 +26,13 @@ class CompNodeSyncManager : public CompNodeDepedentObject {
ThinHashMap<Blob*, std::unique_ptr<CompNode::Event>> m_blob2event;
std::mutex m_mtx;
public:
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
//!
//! Flag set to true by the atexit handler registered in inst(). Blob::~Blob()
//! reads it and skips CompNodeSyncManager::inst().remove() once it is set.
static bool is_into_atexit;
#endif
std::shared_ptr<void> on_comp_node_finalize() override {
MGB_LOCK_GUARD(m_mtx);
m_blob2event.clear();
@@ -34,6 +41,16 @@ public:

//! Return the shared CompNodeSyncManager, a function-local static that is
//! constructed on the first call.
static CompNodeSyncManager& inst() {
static CompNodeSyncManager sl_inst;
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
//!
//! NOTE(review): this branch is taken on every call until the handler has
//! run, so atexit() is invoked once per inst() call rather than once per
//! process -- confirm whether the repeated registration is intended.
if (!is_into_atexit) {
// The handler only flips the flag; atexit() returns non-zero on failure.
auto err = atexit([] { is_into_atexit = true; });
mgb_assert(!err, "failed to register atexit function");
}
#endif
return sl_inst;
}

@@ -52,6 +69,13 @@ public:
m_blob2event.erase(blob);
}
};
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
//!
//! Out-of-class definition of the flag; it starts as false and is switched to
//! true by the atexit handler registered in CompNodeSyncManager::inst().
bool CompNodeSyncManager::is_into_atexit = false;
#endif

// Cache for small blobs
// 1. A blob has to be seen twice (within a window) to be eligible for cache
@@ -221,6 +245,15 @@ Blob::Blob(CompNode cn, size_t sz):

//! Unregister this blob from BlobManager, then remove it from
//! CompNodeSyncManager. On Windows CUDA builds the second step is skipped once
//! the atexit handler has set CompNodeSyncManager::is_into_atexit.
Blob::~Blob() {
BlobManager::inst()->unregister_blob(this);

#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
if (CompNodeSyncManager::is_into_atexit)
return;
#endif
CompNodeSyncManager::inst().remove(this);
}



+ 7
- 0
src/core/impl/comp_node/comp_node.cpp View File

@@ -556,6 +556,13 @@ CompNode CompNode::load(const Locator& locator_physical,
}

void CompNode::finalize() {
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
return;
#endif
comp_node_detail::DepedentObjList::invoke_callback_and_clean();
CudaCompNode::finalize();
CpuCompNode::finalize();


+ 33
- 3
src/core/impl/comp_node/cuda/comp_node.cpp View File

@@ -614,6 +614,18 @@ bool CudaCompNodeImpl::check_global_finalized() {
}
return true;
}
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
if (CudaCompNode::is_into_atexit) {
mgb_log_debug(
"windows cudaErrorCudartUnloading happened!!, resource "
"recovery by OS!!");
return true;
}
#endif
return false;
}

@@ -733,11 +745,29 @@ void CudaCompNode::finalize() {
}
}

CompNode::Impl* CudaCompNode::load_cuda(
const Locator &locator, const Locator &locator_logical) {
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
//!
//! Out-of-class definition of the flag; it starts as false and is switched to
//! true by the atexit handler registered in CudaCompNode::load_cuda().
bool CudaCompNode::is_into_atexit = false;
#endif
CompNode::Impl* CudaCompNode::load_cuda(const Locator& locator,
const Locator& locator_logical) {
int nr_gpu = get_device_count();
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions are
//! called, even when the atexit function is registered after the CUDA driver
//! has been initialized. As a temporary workaround, resources are left for
//! the OS to reclaim; this may be removed after upgrading the CUDA runtime.
if (!is_into_atexit) {
auto err = atexit([] { is_into_atexit = true; });
mgb_assert(!err, "failed to register atexit function");
}
#endif
mgb_assert(locator.device >= 0 && locator.device < nr_gpu,
"request gpu%d out of valid range [0, %d)", locator.device, nr_gpu);
"request gpu%d out of valid range [0, %d)", locator.device,
nr_gpu);

auto &&sdptr = CudaCompNodeImpl::sd;
{


+ 7
- 0
src/core/impl/comp_node/cuda/comp_node.h View File

@@ -36,6 +36,13 @@ namespace mgb {

static void set_prealloc_config(size_t alignment, size_t min_req,
size_t max_overhead, double growth_factor);
#if MGB_CUDA && defined(WIN32)
//! FIXME: on Windows the CUDA driver shuts down before atexit functions
//! are called, even when the atexit function is registered after the
//! CUDA driver has been initialized. As a temporary workaround,
//! resources are left for the OS to reclaim; this may be removed after
//! upgrading the CUDA runtime.
//!
//! Flag recording that atexit processing has begun.
static bool is_into_atexit;
#endif
};
}



Loading…
Cancel
Save