GitOrigin-RevId: de9e7d7f16
release-1.7
@@ -10,7 +10,6 @@ import re
 from typing import Union

 from ..core._imperative_rt.core2 import set_option as _set_option
-from ..core._imperative_rt.utils import _set_defrag

 _eviction_threshold = 0
 _evictee_minimum_size = 1024 ** 2
@@ -216,9 +216,6 @@ void init_utils(py::module m) {
 #endif

     // Debug code, internal only
-    m.def("_set_defrag", [](bool enable) {
-        mgb::imperative::BlobManager::inst()->set_enable(enable);
-    });
     m.def("_defrag", [](const mgb::CompNode& cn) {
         mgb::imperative::BlobManager::inst()->defrag(cn);
     });
@@ -41,22 +41,14 @@ void BlobManagerImpl::unregister_blob(Blob* blob) {
 }

 void BlobManagerImpl::alloc_with_defrag(Blob* blob, size_t size) {
-    if (!m_enable) {
+    // try alloc
+    MGB_TRY { alloc_direct(blob, size); }
+    // if fail, try defrag, alloc again
+    MGB_CATCH(MemAllocError&, {
+        mgb_log_warn("memory allocation failed for blob; try defragmenting");
+        defrag(blob->m_comp_node);
         alloc_direct(blob, size);
-    } else {
-        // // debug
-        // defrag(blob->m_comp_node);
-        // alloc_direct(blob, storage, size);
-
-        // try alloc
-        MGB_TRY { alloc_direct(blob, size); }
-        // if fail, try defrag, alloc again
-        MGB_CATCH(MemAllocError&, {
-            mgb_log_warn("memory allocation failed for blob; try defragmenting");
-            defrag(blob->m_comp_node);
-            alloc_direct(blob, size);
-        });
-    }
+    });
 }

 void BlobManagerImpl::alloc_direct(Blob* blob, size_t size) {
@@ -69,16 +61,12 @@ void BlobManagerImpl::alloc_direct(Blob* blob, size_t size) {

 DeviceTensorND BlobManagerImpl::alloc_workspace_with_defrag(
         CompNode cn, TensorLayout layout) {
     DeviceTensorND dev_tensor;
-    if (!m_enable) {
+    MGB_TRY { dev_tensor = alloc_workspace(cn, layout); }
+    MGB_CATCH(MemAllocError&, {
+        mgb_log_warn("memory allocation failed for workspace; try defragmenting");
+        defrag(cn);
         dev_tensor = alloc_workspace(cn, layout);
-    } else {
-        MGB_TRY { dev_tensor = alloc_workspace(cn, layout); }
-        MGB_CATCH(MemAllocError&, {
-            mgb_log_warn("memory allocation failed for workspace; try defragmenting");
-            defrag(cn);
-            dev_tensor = alloc_workspace(cn, layout);
-        });
-    }
+    });
     return dev_tensor;
 };
@@ -154,10 +142,6 @@ void BlobManagerImpl::defrag(const CompNode& cn) {
     cn.sync();
 }

-void BlobManagerImpl::set_enable(bool flag) {
-    m_enable = flag;
-}
-
 struct BlobManagerStub : BlobManager {
     void alloc_direct(Blob* blob, size_t size) {
         mgb_assert(0, "prohibited after global variable destruction");
@@ -172,9 +156,6 @@ struct BlobManagerStub : BlobManager {
         mgb_assert(0, "prohibited after global variable destruction");
     };
     void unregister_blob(Blob* blob){};
-    void set_enable(bool flag) {
-        mgb_assert(0, "prohibited after global variable destruction");
-    };
     void defrag(const CompNode& cn) {
         mgb_assert(0, "prohibited after global variable destruction");
     };
@@ -38,7 +38,6 @@ class BlobManagerImpl final : public BlobManager {
     std::mutex m_mtx;
     CompNode::UnorderedMap<BlobSetWithMux> m_comp2blobs_map;
-    bool m_enable = true;

     void defrag(const CompNode& cn) override;
@@ -57,8 +56,6 @@ public:
     void register_blob(Blob* blob) override;
     void unregister_blob(Blob* blob) override;
-
-    void set_enable(bool flag) override;
 };

 } // namespace imperative
@@ -33,8 +33,6 @@ public:
     virtual void unregister_blob(Blob* blob) = 0;

-    virtual void set_enable(bool flag) = 0;
-
     virtual void defrag(const CompNode& cn) = 0;
 };
@@ -94,15 +94,13 @@ TEST(TestImperative, Split) {
 }

 #if MGB_CUDA && MGB_ENABLE_EXCEPTION
-void run_graph(size_t mem_reserved, bool enable_defrag) {
+void run_graph(size_t mem_reserved) {
     CompNode::try_coalesce_all_free_memory();
     CompNode::finalize();

     auto cn = CompNode::load("gpux");
     cn.sync(); // wait for async init to finish
-
-    BlobManager::inst()->set_enable(enable_defrag);

     HostTensorGenerator<> gen;
     using TensorPtr = std::shared_ptr<Tensor>;
     TensorPtr ptr_a[100];
@@ -159,10 +157,7 @@ TEST(TestImperative, Defragment) {
     }
     auto reserve_setting = ssprintf("b:%zu", reserve);

-    auto do_run = [reserve]() {
-        ASSERT_THROW(run_graph(reserve, false), MemAllocError);
-        run_graph(reserve, true);
-    };
+    auto do_run = [reserve]() { run_graph(reserve); };

     // reserve memory explicitly to avoid uncontrollable factors
     constexpr const char* KEY = "MGB_CUDA_RESERVE_MEMORY";
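For readers unfamiliar with the MGB_TRY/MGB_CATCH macros in the hunks above: with exceptions enabled they behave like ordinary try/catch, so the allocation path reduces to the allocate / defragment / retry pattern sketched below. This is a minimal, self-contained illustration under assumed names (OutOfMemory, try_alloc, release_fragmented_memory, alloc_with_retry); it is not MegEngine's actual API.

#include <cstddef>
#include <cstdio>
#include <stdexcept>

// Hypothetical stand-in for the allocator's out-of-memory error type.
struct OutOfMemory : std::runtime_error {
    using std::runtime_error::runtime_error;
};

static bool pool_fragmented = true;  // toy state standing in for the device memory pool

void* try_alloc(std::size_t size) {
    // Pretend the first attempt fails while the pool is fragmented.
    if (pool_fragmented)
        throw OutOfMemory("out of memory");
    return ::operator new(size);
}

void release_fragmented_memory() {
    // Stand-in for a defrag pass: consolidate free blocks so a retry can succeed.
    pool_fragmented = false;
}

void* alloc_with_retry(std::size_t size) {
    try {
        return try_alloc(size);          // first attempt
    } catch (const OutOfMemory&) {
        release_fragmented_memory();     // defragment
        return try_alloc(size);          // retry once; a second failure propagates
    }
}

int main() {
    void* p = alloc_with_retry(1024);
    std::printf("allocated after defrag: %p\n", p);
    ::operator delete(p);
    return 0;
}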