diff --git a/imperative/python/megengine/device.py b/imperative/python/megengine/device.py
index 0b1498b1..9ba1c4d6 100644
--- a/imperative/python/megengine/device.py
+++ b/imperative/python/megengine/device.py
@@ -25,6 +25,11 @@ __all__ = [
     "set_default_device",
     "get_mem_status_bytes",
     "get_cuda_compute_capability",
+    "get_allocated_memory",
+    "get_reserved_memory",
+    "get_max_reserved_memory",
+    "get_max_allocated_memory",
+    "reset_max_memory_stats",
     "set_prealloc_config",
     "coalesce_free_memory",
     "DeviceType",
@@ -157,6 +162,61 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int
     return _get_cuda_compute_capability(device, device_type)
 
 
+def get_allocated_memory(device: Optional[str] = None):
+    r"""Returns the current memory occupied by tensors on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_used_memory
+
+
+def get_reserved_memory(device: Optional[str] = None):
+    r"""Returns the current memory managed by the caching allocator on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_reserved_memory
+
+
+def get_max_reserved_memory(device: Optional[str] = None):
+    r"""Returns the maximum memory managed by the caching allocator on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_max_reserved_memory
+
+
+def get_max_allocated_memory(device: Optional[str] = None):
+    r"""Returns the maximum memory occupied by tensors on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_max_used_memory
+
+
+def reset_max_memory_stats(device: Optional[str] = None):
+    r"""Resets the maximum memory stats on the computing device.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to properly reset memory stats.
+ """ + if device is None: + device = get_default_device() + CompNode.reset_max_memory_stats(device) + + set_default_device(os.getenv("MGE_DEFAULT_DEVICE", "xpux")) diff --git a/imperative/python/src/common.cpp b/imperative/python/src/common.cpp index 5c10b761..80483c97 100644 --- a/imperative/python/src/common.cpp +++ b/imperative/python/src/common.cpp @@ -73,6 +73,26 @@ void init_common(py::module m) { [](const CompNode& cn) { return cn.get_mem_status_bytes(); }) + .def_property_readonly( + "get_used_memory", + [](const CompNode& cn) { return cn.get_used_memory(); }) + .def_property_readonly( + "get_max_used_memory", + [](const CompNode& cn) { return cn.get_max_used_memory(); }) + .def_property_readonly( + "get_reserved_memory", + [](const CompNode& cn) { return cn.get_reserved_memory(); }) + .def_property_readonly( + "get_max_reserved_memory", + [](const CompNode& cn) { + return cn.get_max_reserved_memory(); + }) + .def_static( + "reset_max_memory_stats", + [](const CompNode& cn) { + cn.reset_max_used_memory(); + cn.reset_max_reserved_memory(); + }) .def("create_event", &CompNode::create_event, py::arg("flags") = 0ul) .def_static("_set_default_device", &set_default_device) diff --git a/src/core/impl/comp_node/cuda/comp_node.cpp b/src/core/impl/comp_node/cuda/comp_node.cpp index e37351be..5891a7c5 100644 --- a/src/core/impl/comp_node/cuda/comp_node.cpp +++ b/src/core/impl/comp_node/cuda/comp_node.cpp @@ -208,20 +208,7 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { public: CompNodeImpl() : Impl(static_free_device, static_free_host) {} - void* alloc_device(size_t size) override { - activate(); -#if MGB_BUILD_SLIM_SERVING - return m_mem_alloc->alloc(size); -#else - void* ptr = m_mem_alloc->alloc(size); - { - MGB_LOCK_GUARD(m_update_mem); - ptr2size[ptr] = size; - m_used_mem += size; - } - return ptr; -#endif - } + void* alloc_device(size_t size) override; void free_device(void* ptr); @@ -311,20 +298,30 @@ public: uint64_t get_uid() override { return m_uid; } #if !MGB_BUILD_SLIM_SERVING - size_t get_used_memory() override { return m_used_mem; } + size_t get_used_memory() override; + + size_t get_max_used_memory() override; + + size_t get_reserved_memory() override; + + size_t get_max_reserved_memory() override; + + void reset_max_used_memory() override; + void reset_max_reserved_memory() override; #endif private: uint64_t m_uid; #if !MGB_BUILD_SLIM_SERVING std::unordered_map ptr2size; - size_t m_used_mem = 0; #endif }; MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl); struct CudaCompNodeImpl::DeviceInfo { int dev_num = -1; + std::atomic_size_t m_used_mem{0}; + std::atomic_size_t m_max_used_mem{0}; std::unique_ptr mem_alloc; bool init_done() const { return mem_alloc.get(); } @@ -438,6 +435,24 @@ void CudaCompNodeImpl::fini() { m_initialized = false; } +void* CudaCompNodeImpl::alloc_device(size_t size) { + activate(); +#if MGB_BUILD_SLIM_SERVING + return m_mem_alloc->alloc(size); +#else + void* ptr = m_mem_alloc->alloc(size); + { + MGB_LOCK_GUARD(m_update_mem); + ptr2size[ptr] = size; + m_device_info->m_used_mem += size; + if (m_device_info->m_used_mem > m_device_info->m_max_used_mem) { + m_device_info->m_max_used_mem = m_device_info->m_used_mem.load(); + } + } + return ptr; +#endif +} + void CudaCompNodeImpl::free_device(void* ptr) { if (check_global_finalized()) return; @@ -447,13 +462,39 @@ void CudaCompNodeImpl::free_device(void* ptr) { { MGB_LOCK_GUARD(m_update_mem); mgb_assert(ptr2size.find(ptr) != ptr2size.end(), "ptr %p not found!", ptr); - m_used_mem -= 
+        m_device_info->m_used_mem -= ptr2size.at(ptr);
         ptr2size.erase(ptr);
     }
 #endif
     m_mem_alloc->free(ptr);
 }
 
+#if !MGB_BUILD_SLIM_SERVING
+size_t CudaCompNodeImpl::get_used_memory() {
+    return m_device_info->m_used_mem.load();
+}
+
+size_t CudaCompNodeImpl::get_max_used_memory() {
+    return m_device_info->m_max_used_mem.load();
+}
+
+void CudaCompNodeImpl::reset_max_used_memory() {
+    m_device_info->m_max_used_mem = 0;
+}
+
+size_t CudaCompNodeImpl::get_reserved_memory() {
+    return m_device_info->mem_alloc->get_used_memory();
+}
+
+size_t CudaCompNodeImpl::get_max_reserved_memory() {
+    return m_device_info->mem_alloc->get_max_used_memory();
+}
+
+void CudaCompNodeImpl::reset_max_reserved_memory() {
+    m_device_info->mem_alloc->reset_max_used_memory();
+}
+#endif
+
 void* CudaCompNodeImpl::alloc_host(size_t size) {
     // need activate because it create cuda cuda context in current device
     activate();
diff --git a/src/core/impl/comp_node/mem_alloc/impl.cpp b/src/core/impl/comp_node/mem_alloc/impl.cpp
index f73f5f7e..175b5e86 100644
--- a/src/core/impl/comp_node/mem_alloc/impl.cpp
+++ b/src/core/impl/comp_node/mem_alloc/impl.cpp
@@ -226,6 +226,9 @@ StreamMemAlloc* DevMemAllocImpl::add_stream(StreamKey stream) {
 
 MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc(size_t size) {
     auto addr = do_alloc(size, true);
     m_used_size += size;
+    if (m_used_size > m_max_used_size) {
+        m_max_used_size = m_used_size.load();
+    }
     return addr;
 }
@@ -291,6 +294,9 @@ MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc_from_parent(size_t size) {
         // exception would be thrown from here
         auto t = do_alloc(size, false, true);
         m_used_size += size;
+        if (m_used_size > m_max_used_size) {
+            m_max_used_size = m_used_size.load();
+        }
         return t;
     }
 }
@@ -419,6 +425,9 @@ void DevMemAllocImpl::insert_free_unsafe(const FreeBlock& block) {
             child->insert_free_unsafe(block);
         }
         m_used_size += block.size;
+        if (m_used_size > m_max_used_size) {
+            m_max_used_size = m_used_size.load();
+        }
     } else {
         MemAllocImplHelper::insert_free_unsafe(block);
     }
diff --git a/src/core/impl/comp_node/mem_alloc/impl.h b/src/core/impl/comp_node/mem_alloc/impl.h
index fa8a305b..5858c09a 100644
--- a/src/core/impl/comp_node/mem_alloc/impl.h
+++ b/src/core/impl/comp_node/mem_alloc/impl.h
@@ -171,6 +171,7 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper {
     size_t m_tot_allocated_from_raw = 0;
 
     std::atomic_size_t m_used_size{0};
+    std::atomic_size_t m_max_used_size{0};
 
     /*!
      * \brief gather all free blocks from child streams, and release full chunks
@@ -197,6 +198,10 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper {
 
     size_t get_used_memory() override { return m_used_size.load(); }
 
+    size_t get_max_used_memory() override { return m_max_used_size.load(); }
+
+    void reset_max_used_memory() override { m_max_used_size = 0; }
+
     void insert_free_unsafe(const FreeBlock& block) override;
 
     /*!
diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h
index 7ca88006..d672a803 100644
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -335,11 +335,23 @@ public:
     size_t get_used_memory() const { return m_impl->get_used_memory(); }
 
+    size_t get_reserved_memory() const { return m_impl->get_reserved_memory(); }
+
+    size_t get_max_reserved_memory() const { return m_impl->get_max_reserved_memory(); }
+
+    size_t get_max_used_memory() const { return m_impl->get_max_used_memory(); }
+
     size_t get_max_block_size_available() const {
         return m_impl->get_max_block_size_available();
     }
 
     size_t get_free_mem() const { return m_impl->get_free_mem(); }
+
+    void reset_max_reserved_memory() const {
+        return m_impl->reset_max_reserved_memory();
+    }
+
+    void reset_max_used_memory() const { return m_impl->reset_max_used_memory(); }
 #endif
 
     //! change to another stream on the same memory node
@@ -533,8 +545,13 @@ protected:
         return {x - x, y - y};
     }
     virtual size_t get_used_memory() { return 0; }
+    virtual size_t get_reserved_memory() { return 0; }
+    virtual size_t get_max_reserved_memory() { return 0; }
+    virtual size_t get_max_used_memory() { return 0; }
     virtual size_t get_max_block_size_available() { return 0; }
     virtual size_t get_free_mem() { return 0; }
+    virtual void reset_max_reserved_memory() {}
+    virtual void reset_max_used_memory() {}
 #endif
 
     virtual Locator locator() = 0;
diff --git a/src/core/include/megbrain/comp_node/alloc.h b/src/core/include/megbrain/comp_node/alloc.h
index 92464967..601a3330 100644
--- a/src/core/include/megbrain/comp_node/alloc.h
+++ b/src/core/include/megbrain/comp_node/alloc.h
@@ -275,6 +275,10 @@ public:
     const PreAllocConfig& prealloc_config() { return m_prealloc_config; }
 
+    virtual size_t get_used_memory() { return 0; }
+    virtual size_t get_max_used_memory() { return 0; }
+    virtual void reset_max_used_memory() {}
+
 private:
     size_t m_alignment = 1;
     PreAllocConfig m_prealloc_config;
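
For reference, a minimal usage sketch of the Python API this patch exposes (not part of the diff itself). The device string "gpu0" and the NumPy-based tensor creation are illustrative assumptions for a CUDA-enabled build; megengine._full_sync is called before reading the counters, as the new docstrings recommend:

    import numpy as np

    import megengine as mge
    from megengine.device import (
        get_allocated_memory,
        get_max_allocated_memory,
        get_max_reserved_memory,
        get_reserved_memory,
        reset_max_memory_stats,
    )

    # Allocate a tensor on the first CUDA device so the counters have something to track.
    x = mge.tensor(np.ones((1024, 1024), dtype="float32"), device="gpu0")
    y = x * 2

    mge._full_sync()  # drain the async execution queue so the values below are accurate
    print("allocated    :", get_allocated_memory("gpu0"))      # bytes currently held by tensors
    print("reserved     :", get_reserved_memory("gpu0"))       # bytes held by the caching allocator
    print("max allocated:", get_max_allocated_memory("gpu0"))  # peak tensor usage since last reset
    print("max reserved :", get_max_reserved_memory("gpu0"))   # peak allocator usage since last reset

    reset_max_memory_stats("gpu0")  # clear the recorded peaks; tracking resumes on the next allocation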