diff --git a/imperative/python/megengine/device.py b/imperative/python/megengine/device.py
index 0b1498b1..9ba1c4d6 100644
--- a/imperative/python/megengine/device.py
+++ b/imperative/python/megengine/device.py
@@ -25,6 +25,11 @@ __all__ = [
     "set_default_device",
     "get_mem_status_bytes",
     "get_cuda_compute_capability",
+    "get_allocated_memory",
+    "get_reserved_memory",
+    "get_max_reserved_memory",
+    "get_max_allocated_memory",
+    "reset_max_memory_stats",
     "set_prealloc_config",
     "coalesce_free_memory",
     "DeviceType",
@@ -157,6 +162,61 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int
     return _get_cuda_compute_capability(device, device_type)
 
 
+def get_allocated_memory(device: Optional[str] = None):
+    r"""Returns the current memory occupied by tensors on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_used_memory
+
+
+def get_reserved_memory(device: Optional[str] = None):
+    r"""Returns the current memory managed by the caching allocator on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_reserved_memory
+
+
+def get_max_reserved_memory(device: Optional[str] = None):
+    r"""Returns the maximum memory managed by the caching allocator on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_max_reserved_memory
+
+
+def get_max_allocated_memory(device: Optional[str] = None):
+    r"""Returns the maximum memory occupied by tensors on the computing device in bytes.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to get an accurate value.
+    """
+    if device is None:
+        device = get_default_device()
+    return CompNode(device).get_max_used_memory
+
+
+def reset_max_memory_stats(device: Optional[str] = None):
+    r"""Resets the maximum memory stats on the computing device.
+
+    Due to the asynchronous execution of MegEngine, please call megengine._full_sync
+    before calling this function in order to properly reset memory stats.
+ """ + if device is None: + device = get_default_device() + CompNode.reset_max_memory_stats(device) + + set_default_device(os.getenv("MGE_DEFAULT_DEVICE", "xpux")) diff --git a/imperative/python/src/common.cpp b/imperative/python/src/common.cpp index 5c10b761..80483c97 100644 --- a/imperative/python/src/common.cpp +++ b/imperative/python/src/common.cpp @@ -73,6 +73,26 @@ void init_common(py::module m) { [](const CompNode& cn) { return cn.get_mem_status_bytes(); }) + .def_property_readonly( + "get_used_memory", + [](const CompNode& cn) { return cn.get_used_memory(); }) + .def_property_readonly( + "get_max_used_memory", + [](const CompNode& cn) { return cn.get_max_used_memory(); }) + .def_property_readonly( + "get_reserved_memory", + [](const CompNode& cn) { return cn.get_reserved_memory(); }) + .def_property_readonly( + "get_max_reserved_memory", + [](const CompNode& cn) { + return cn.get_max_reserved_memory(); + }) + .def_static( + "reset_max_memory_stats", + [](const CompNode& cn) { + cn.reset_max_used_memory(); + cn.reset_max_reserved_memory(); + }) .def("create_event", &CompNode::create_event, py::arg("flags") = 0ul) .def_static("_set_default_device", &set_default_device) diff --git a/src/core/impl/comp_node/cuda/comp_node.cpp b/src/core/impl/comp_node/cuda/comp_node.cpp index e37351be..5891a7c5 100644 --- a/src/core/impl/comp_node/cuda/comp_node.cpp +++ b/src/core/impl/comp_node/cuda/comp_node.cpp @@ -208,20 +208,7 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { public: CompNodeImpl() : Impl(static_free_device, static_free_host) {} - void* alloc_device(size_t size) override { - activate(); -#if MGB_BUILD_SLIM_SERVING - return m_mem_alloc->alloc(size); -#else - void* ptr = m_mem_alloc->alloc(size); - { - MGB_LOCK_GUARD(m_update_mem); - ptr2size[ptr] = size; - m_used_mem += size; - } - return ptr; -#endif - } + void* alloc_device(size_t size) override; void free_device(void* ptr); @@ -311,20 +298,30 @@ public: uint64_t get_uid() override { return m_uid; } #if !MGB_BUILD_SLIM_SERVING - size_t get_used_memory() override { return m_used_mem; } + size_t get_used_memory() override; + + size_t get_max_used_memory() override; + + size_t get_reserved_memory() override; + + size_t get_max_reserved_memory() override; + + void reset_max_used_memory() override; + void reset_max_reserved_memory() override; #endif private: uint64_t m_uid; #if !MGB_BUILD_SLIM_SERVING std::unordered_map ptr2size; - size_t m_used_mem = 0; #endif }; MGB_DYN_TYPE_OBJ_FINAL_IMPL(CudaCompNode::CompNodeImpl); struct CudaCompNodeImpl::DeviceInfo { int dev_num = -1; + std::atomic_size_t m_used_mem{0}; + std::atomic_size_t m_max_used_mem{0}; std::unique_ptr mem_alloc; bool init_done() const { return mem_alloc.get(); } @@ -438,6 +435,24 @@ void CudaCompNodeImpl::fini() { m_initialized = false; } +void* CudaCompNodeImpl::alloc_device(size_t size) { + activate(); +#if MGB_BUILD_SLIM_SERVING + return m_mem_alloc->alloc(size); +#else + void* ptr = m_mem_alloc->alloc(size); + { + MGB_LOCK_GUARD(m_update_mem); + ptr2size[ptr] = size; + m_device_info->m_used_mem += size; + if (m_device_info->m_used_mem > m_device_info->m_max_used_mem) { + m_device_info->m_max_used_mem = m_device_info->m_used_mem.load(); + } + } + return ptr; +#endif +} + void CudaCompNodeImpl::free_device(void* ptr) { if (check_global_finalized()) return; @@ -447,13 +462,39 @@ void CudaCompNodeImpl::free_device(void* ptr) { { MGB_LOCK_GUARD(m_update_mem); mgb_assert(ptr2size.find(ptr) != ptr2size.end(), "ptr %p not found!", ptr); - m_used_mem -= 
+        m_device_info->m_used_mem -= ptr2size.at(ptr);
         ptr2size.erase(ptr);
     }
 #endif
     m_mem_alloc->free(ptr);
 }
 
+#if !MGB_BUILD_SLIM_SERVING
+size_t CudaCompNodeImpl::get_used_memory() {
+    return m_device_info->m_used_mem.load();
+}
+
+size_t CudaCompNodeImpl::get_max_used_memory() {
+    return m_device_info->m_max_used_mem.load();
+}
+
+void CudaCompNodeImpl::reset_max_used_memory() {
+    m_device_info->m_max_used_mem = 0;
+}
+
+size_t CudaCompNodeImpl::get_reserved_memory() {
+    return m_device_info->mem_alloc->get_used_memory();
+}
+
+size_t CudaCompNodeImpl::get_max_reserved_memory() {
+    return m_device_info->mem_alloc->get_max_used_memory();
+}
+
+void CudaCompNodeImpl::reset_max_reserved_memory() {
+    m_device_info->mem_alloc->reset_max_used_memory();
+}
+#endif
+
 void* CudaCompNodeImpl::alloc_host(size_t size) {
     // need activate because it create cuda cuda context in current device
     activate();
diff --git a/src/core/impl/comp_node/mem_alloc/impl.cpp b/src/core/impl/comp_node/mem_alloc/impl.cpp
index f73f5f7e..175b5e86 100644
--- a/src/core/impl/comp_node/mem_alloc/impl.cpp
+++ b/src/core/impl/comp_node/mem_alloc/impl.cpp
@@ -226,6 +226,9 @@ StreamMemAlloc* DevMemAllocImpl::add_stream(StreamKey stream) {
 
 MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc(size_t size) {
     auto addr = do_alloc(size, true);
     m_used_size += size;
+    if (m_used_size > m_max_used_size) {
+        m_max_used_size = m_used_size.load();
+    }
     return addr;
 }
@@ -291,6 +294,9 @@ MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc_from_parent(size_t size) {
         // exception would be thrown from here
         auto t = do_alloc(size, false, true);
         m_used_size += size;
+        if (m_used_size > m_max_used_size) {
+            m_max_used_size = m_used_size.load();
+        }
         return t;
     }
 }
@@ -419,6 +425,9 @@ void DevMemAllocImpl::insert_free_unsafe(const FreeBlock& block) {
             child->insert_free_unsafe(block);
         }
         m_used_size += block.size;
+        if (m_used_size > m_max_used_size) {
+            m_max_used_size = m_used_size.load();
+        }
     } else {
         MemAllocImplHelper::insert_free_unsafe(block);
     }
diff --git a/src/core/impl/comp_node/mem_alloc/impl.h b/src/core/impl/comp_node/mem_alloc/impl.h
index fa8a305b..5858c09a 100644
--- a/src/core/impl/comp_node/mem_alloc/impl.h
+++ b/src/core/impl/comp_node/mem_alloc/impl.h
@@ -171,6 +171,7 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper {
     size_t m_tot_allocated_from_raw = 0;
 
     std::atomic_size_t m_used_size{0};
+    std::atomic_size_t m_max_used_size{0};
 
     /*!
      * \brief gather all free blocks from child streams, and release full chunks
@@ -197,6 +198,10 @@ class DevMemAllocImpl final : public DevMemAlloc, public MemAllocImplHelper {
 
     size_t get_used_memory() override { return m_used_size.load(); }
 
+    size_t get_max_used_memory() override { return m_max_used_size.load(); }
+
+    void reset_max_used_memory() override { m_max_used_size = 0; }
+
     void insert_free_unsafe(const FreeBlock& block) override;
 
     /*!
diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h
index 7ca88006..d672a803 100644
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -335,11 +335,23 @@ public:
     size_t get_used_memory() const { return m_impl->get_used_memory(); }
 
+    size_t get_reserved_memory() const { return m_impl->get_reserved_memory(); }
+
+    size_t get_max_reserved_memory() const { return m_impl->get_max_reserved_memory(); }
+
+    size_t get_max_used_memory() const { return m_impl->get_max_used_memory(); }
+
     size_t get_max_block_size_available() const {
         return m_impl->get_max_block_size_available();
     }
 
     size_t get_free_mem() const { return m_impl->get_free_mem(); }
+
+    void reset_max_reserved_memory() const {
+        return m_impl->reset_max_reserved_memory();
+    }
+
+    void reset_max_used_memory() const { return m_impl->reset_max_used_memory(); }
 #endif
 
     //! change to another stream on the same memory node
@@ -533,8 +545,13 @@ protected:
         return {x - x, y - y};
     }
     virtual size_t get_used_memory() { return 0; }
+    virtual size_t get_reserved_memory() { return 0; }
+    virtual size_t get_max_reserved_memory() { return 0; }
+    virtual size_t get_max_used_memory() { return 0; }
     virtual size_t get_max_block_size_available() { return 0; }
     virtual size_t get_free_mem() { return 0; }
+    virtual void reset_max_reserved_memory() {}
+    virtual void reset_max_used_memory() {}
 #endif
 
     virtual Locator locator() = 0;
diff --git a/src/core/include/megbrain/comp_node/alloc.h b/src/core/include/megbrain/comp_node/alloc.h
index 92464967..601a3330 100644
--- a/src/core/include/megbrain/comp_node/alloc.h
+++ b/src/core/include/megbrain/comp_node/alloc.h
@@ -275,6 +275,10 @@ public:
     const PreAllocConfig& prealloc_config() { return m_prealloc_config; }
 
+    virtual size_t get_used_memory() { return 0; }
+    virtual size_t get_max_used_memory() { return 0; }
+    virtual void reset_max_used_memory() {}
+
 private:
     size_t m_alignment = 1;
     PreAllocConfig m_prealloc_config;
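
For reference, a minimal usage sketch of the Python API this patch exposes (not part of the diff itself). The device string "gpu0" and the NumPy-based tensor creation are illustrative assumptions for a CUDA-enabled build; megengine._full_sync is called before reading the counters, as the new docstrings recommend:

    import numpy as np

    import megengine as mge
    from megengine.device import (
        get_allocated_memory,
        get_max_allocated_memory,
        get_max_reserved_memory,
        get_reserved_memory,
        reset_max_memory_stats,
    )

    # Allocate a tensor on the first CUDA device so the counters have something to track.
    x = mge.tensor(np.ones((1024, 1024), dtype="float32"), device="gpu0")
    y = x * 2

    mge._full_sync()  # drain the async execution queue so the values below are accurate
    print("allocated    :", get_allocated_memory("gpu0"))      # bytes currently held by tensors
    print("reserved     :", get_reserved_memory("gpu0"))       # bytes held by the caching allocator
    print("max allocated:", get_max_allocated_memory("gpu0"))  # peak tensor usage since last reset
    print("max reserved :", get_max_reserved_memory("gpu0"))   # peak allocator usage since last reset

    reset_max_memory_stats("gpu0")  # clear the recorded peaks; tracking resumes on the next allocation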