From b43fb1a97ca380c45857c5071667e6e695adfd6b Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 29 Jul 2020 14:48:54 +0800 Subject: [PATCH] perf(mgb): add CUDA host memory allocator test(mgb): add SimpleCachingAlloc test GitOrigin-RevId: 17f381e4ace4db06ea9d3e659da87646c68aa19c --- src/core/impl/comp_node/cuda/comp_node.cpp | 72 +++++++++++++++++++++++------ src/core/impl/comp_node/mem_alloc/impl.cpp | 54 +++++++++++++++++++++- src/core/impl/comp_node/mem_alloc/impl.h | 27 ++++++++++- src/core/include/megbrain/comp_node/alloc.h | 26 +++++++++++ src/core/test/mem_alloc.cpp | 48 +++++++++++++++++++ 5 files changed, 212 insertions(+), 15 deletions(-) diff --git a/src/core/impl/comp_node/cuda/comp_node.cpp b/src/core/impl/comp_node/cuda/comp_node.cpp index 467931ce..b12e8167 100644 --- a/src/core/impl/comp_node/cuda/comp_node.cpp +++ b/src/core/impl/comp_node/cuda/comp_node.cpp @@ -99,6 +99,46 @@ public: } }; +class CudaHostAllocator : public RawAllocator { +public: + void* alloc(size_t size) override { + void* addr; + cudaError_t cuda_error = cudaHostAlloc(&addr, size, cudaHostAllocDefault); + if (cuda_error == cudaSuccess) { + mgb_assert(addr); + return addr; + } + auto msg = mgb_ssprintf_log( + "cudaHostAlloc failed while requesting %zd bytes (%.3fMiB)" + " of pinned host memory; error: %s", + size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error)); + msg.append(CudaError::get_cuda_extra_info()); + if (cuda_error == cudaErrorMemoryAllocation) { + mgb_log_error("%s", msg.c_str()); + // clear cuda error + cudaGetLastError(); + mgb_assert(cudaGetLastError() == cudaSuccess); + return nullptr; + } + mgb_throw_raw(MemAllocError{msg}); + } + + void free(void* ptr) override { + cudaError_t cuda_error = cudaFreeHost(ptr); + if (cuda_error == cudaSuccess) + return; + auto msg = ssprintf("cudaFreeHost failed for %p: %s", ptr, + cudaGetErrorString(cuda_error)); + msg.append(CudaError::get_cuda_extra_info()); + mgb_throw_raw(MemAllocError{msg}); + } + + void get_mem_info(size_t& free, size_t& tot) override { + free = 0; + tot = 0; + } +}; + class CudaDeviceRuntimePolicy : public DeviceRuntimePolicy { public: CompNode::DeviceType device_type() override { @@ -175,19 +215,9 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl { void free_device(void *ptr); - void *alloc_host(size_t size) override { - activate(); - void *ptr; - MGB_CUDA_CHECK(cudaMallocHost(&ptr, size)); - return ptr; - } + void *alloc_host(size_t size) override; - void free_host(void *ptr) { - if (!check_global_finalized()) { - activate(); - } - MGB_CUDA_CHECK(cudaFreeHost(ptr)); - } + void free_host(void *ptr); void copy_to_host(void *host_ptr, const void *device_ptr, size_t size) override { @@ -284,14 +314,18 @@ struct CudaCompNodeImpl::StaticData { mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; + std::unique_ptr host_alloc; CudaCompNode::CompNodeImpl node[MAX_NR_COMP_NODE]; DeviceInfo dev_info[MAX_NR_DEVICE]; int nr_node = 0, //!< number of loaded node[] nr_dev_used = 0; //!< number of used dev_info[] - StaticData() { + StaticData() : host_alloc( + mem_alloc::SimpleCachingAlloc::make( + std::make_unique())) { prealloc_config.max_overhead = 0; prealloc_config.alignment = 1; + host_alloc->alignment(1); } ~StaticData() { @@ -388,6 +422,18 @@ void CudaCompNodeImpl::free_device(void *ptr) { m_mem_alloc->free(ptr); } +void* CudaCompNodeImpl::alloc_host(size_t size) { + // no need for activate() here because under + // unified addressing, host memory can be accessed + // and freed on any device + return sd->host_alloc->alloc(size); +} + +void CudaCompNodeImpl::free_host(void* ptr) { + if (check_global_finalized()) return; + sd->host_alloc->free(ptr); +} + void CudaCompNodeImpl::peer_copy_to( Impl *dest_impl, void *dest, const void *src, size_t size) { if (dest_impl->same_type()) { diff --git a/src/core/impl/comp_node/mem_alloc/impl.cpp b/src/core/impl/comp_node/mem_alloc/impl.cpp index 64ef4001..7fdebc38 100644 --- a/src/core/impl/comp_node/mem_alloc/impl.cpp +++ b/src/core/impl/comp_node/mem_alloc/impl.cpp @@ -364,5 +364,57 @@ DevMemAllocImpl::~DevMemAllocImpl() { m_raw_allocator->free(i.first); } -// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} +std::unique_ptr SimpleCachingAlloc::make(std::unique_ptr raw_alloc) { + return std::make_unique(std::move(raw_alloc)); +} + +SimpleCachingAllocImpl::SimpleCachingAllocImpl(std::unique_ptr raw_alloc) + : m_raw_alloc(std::move(raw_alloc)) {} + +void* SimpleCachingAllocImpl::alloc(size_t size) { + size = get_aligned_power2(size, m_alignment); + auto&& addr = do_alloc(size, true); + auto ptr = addr.addr_ptr(); + MGB_LOCK_GUARD(m_mutex); + m_allocated_blocks[ptr] = {addr.is_head, size}; + m_used_size += size; + return ptr; +} + +void SimpleCachingAllocImpl::free(void* ptr) { + MGB_LOCK_GUARD(m_mutex); + auto&& iter = m_allocated_blocks.find(ptr); + mgb_assert(iter != m_allocated_blocks.end(), + "releasing bad pointer: %p", ptr); + auto size = iter->second.size; + FreeBlock fb{MemAddr{iter->second.is_head, reinterpret_cast(ptr)}, size}; + m_allocated_blocks.erase(iter); + merge_free_unsafe(fb); + m_used_size -= size; +} +SimpleCachingAllocImpl::~SimpleCachingAllocImpl() { + for (auto&& ptr_size : m_alloc_from_raw) { + m_raw_alloc->free(ptr_size.first); + } +} + +SimpleCachingAllocImpl::MemAddr SimpleCachingAllocImpl::alloc_from_parent(size_t size) { + void* ptr = m_raw_alloc->alloc(size); + m_alloc_from_raw[ptr] = size; + return {true, reinterpret_cast(ptr)}; +} + +std::string SimpleCachingAllocImpl::get_name() const { + return "SimpleCachingAllocImpl"; +} + +size_t SimpleCachingAllocImpl::get_used_memory() { + return m_used_size; +} + +FreeMemStat SimpleCachingAllocImpl::get_free_memory_dev() { + return get_free_memory(); +} + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/core/impl/comp_node/mem_alloc/impl.h b/src/core/impl/comp_node/mem_alloc/impl.h index 46a35d87..73c2c0b0 100644 --- a/src/core/impl/comp_node/mem_alloc/impl.h +++ b/src/core/impl/comp_node/mem_alloc/impl.h @@ -211,7 +211,32 @@ public: FreeMemStat get_free_memory_dev() override; }; +class SimpleCachingAllocImpl : public SimpleCachingAlloc, + public MemAllocImplHelper { + struct AllocatedBlock { + bool is_head; + size_t size; + }; + + std::unique_ptr m_raw_alloc; + std::unordered_map m_alloc_from_raw; + std::unordered_map m_allocated_blocks; + size_t m_used_size = 0; + +public: + SimpleCachingAllocImpl(std::unique_ptr m_raw_alloc); + ~SimpleCachingAllocImpl(); + + void* alloc(size_t size) override; + void free(void* ptr) override; + size_t get_used_memory() override; + FreeMemStat get_free_memory_dev() override; + +protected: + MemAddr alloc_from_parent(size_t size) override; + std::string get_name() const override; +}; + } } // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} - diff --git a/src/core/include/megbrain/comp_node/alloc.h b/src/core/include/megbrain/comp_node/alloc.h index 9d7e51f0..e546e440 100644 --- a/src/core/include/megbrain/comp_node/alloc.h +++ b/src/core/include/megbrain/comp_node/alloc.h @@ -341,6 +341,32 @@ public: FwdDevMemAlloc(const std::shared_ptr& ra) : m_raw_alloc(ra) {} }; +/* ===================== SimpleCachingAlloc ===================== */ +/*! + * \brief An allocator that cache allocations to reduce call to raw allocator. + * Mainly used for CUDA pinned memory. + */ +class SimpleCachingAlloc : virtual public MemAllocBase { +protected: + size_t m_alignment = 1; + +public: + virtual ~SimpleCachingAlloc() = default; + static std::unique_ptr make(std::unique_ptr raw_alloc); + + virtual void* alloc(size_t size) = 0; + virtual void free(void* ptr) = 0; + + SimpleCachingAlloc& alignment(size_t alignment) { + m_alignment = alignment; + return *this; + }; + + size_t alignment() const { + return m_alignment; + }; +}; + } // mem_alloc } // mgb diff --git a/src/core/test/mem_alloc.cpp b/src/core/test/mem_alloc.cpp index 83b07ae3..b9e87341 100644 --- a/src/core/test/mem_alloc.cpp +++ b/src/core/test/mem_alloc.cpp @@ -440,6 +440,54 @@ TEST(TestMemAlloc, RandomOprs) { ASSERT_EQ(dummy_alloc->nr_alloc(), dummy_alloc->nr_free()); } +TEST(TestSimpleCachingAlloc, Basic) { + constexpr size_t TOT = 2048, REQ = 1000; + static_assert(TOT > REQ * 2, ""); + auto raw_alloc = new DummyAllocator(TOT); + auto alloc = SimpleCachingAlloc::make(std::unique_ptr(raw_alloc)); + + auto ptr = alloc->alloc(REQ); + EXPECT_EQ(TOT - REQ, raw_alloc->free_size()); + EXPECT_EQ(REQ, alloc->get_used_memory()); + EXPECT_EQ(0u, alloc->get_free_memory().tot); + + alloc->free(ptr); + EXPECT_EQ(0u, raw_alloc->nr_free()); + EXPECT_EQ(REQ, alloc->get_free_memory().tot); + + ptr = alloc->alloc(REQ / 2); + EXPECT_EQ(1u, raw_alloc->nr_alloc()); + EXPECT_EQ(REQ / 2, alloc->get_used_memory()); + EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot); + + auto ptr2 = alloc->alloc(REQ / 2); + EXPECT_EQ(1u, raw_alloc->nr_alloc()); + EXPECT_EQ(REQ / 2 * 2, alloc->get_used_memory()); + EXPECT_EQ(REQ - REQ / 2 * 2, alloc->get_free_memory().tot); + EXPECT_EQ(REQ / 2, (char*)ptr2 - (char*)ptr); + + alloc->free(ptr); + EXPECT_EQ(1u, raw_alloc->nr_alloc()); + EXPECT_EQ(REQ / 2, alloc->get_used_memory()); + EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot); + + ptr = alloc->alloc(REQ); + EXPECT_EQ(2u, raw_alloc->nr_alloc()); + EXPECT_EQ(TOT - REQ * 2, raw_alloc->free_size()); + EXPECT_EQ(REQ + REQ / 2, alloc->get_used_memory()); + EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot); + + alloc->free(ptr2); + ptr2 = alloc->alloc(REQ); + EXPECT_EQ(2u, raw_alloc->nr_alloc()); + EXPECT_EQ(REQ * 2, alloc->get_used_memory()); + EXPECT_EQ(0u, alloc->get_free_memory().tot); + + alloc->free(ptr); + alloc->free(ptr2); + EXPECT_EQ(0u, raw_alloc->nr_free()); +}; + namespace { class DevicePolicy { public: