test(mgb): add SimpleCachingAlloc test
GitOrigin-RevId: 17f381e4ac
tags/v1.0.0-rc1
@@ -99,6 +99,46 @@ public: | |||
} | |||
}; | |||
class CudaHostAllocator : public RawAllocator { | |||
public: | |||
void* alloc(size_t size) override { | |||
void* addr; | |||
cudaError_t cuda_error = cudaHostAlloc(&addr, size, cudaHostAllocDefault); | |||
if (cuda_error == cudaSuccess) { | |||
mgb_assert(addr); | |||
return addr; | |||
} | |||
auto msg = mgb_ssprintf_log( | |||
"cudaHostAlloc failed while requesting %zd bytes (%.3fMiB)" | |||
" of pinned host memory; error: %s", | |||
size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error)); | |||
msg.append(CudaError::get_cuda_extra_info()); | |||
if (cuda_error == cudaErrorMemoryAllocation) { | |||
mgb_log_error("%s", msg.c_str()); | |||
// clear cuda error | |||
cudaGetLastError(); | |||
mgb_assert(cudaGetLastError() == cudaSuccess); | |||
return nullptr; | |||
} | |||
mgb_throw_raw(MemAllocError{msg}); | |||
} | |||
void free(void* ptr) override { | |||
cudaError_t cuda_error = cudaFreeHost(ptr); | |||
if (cuda_error == cudaSuccess) | |||
return; | |||
auto msg = ssprintf("cudaFreeHost failed for %p: %s", ptr, | |||
cudaGetErrorString(cuda_error)); | |||
msg.append(CudaError::get_cuda_extra_info()); | |||
mgb_throw_raw(MemAllocError{msg}); | |||
} | |||
void get_mem_info(size_t& free, size_t& tot) override { | |||
free = 0; | |||
tot = 0; | |||
} | |||
}; | |||
class CudaDeviceRuntimePolicy : public DeviceRuntimePolicy { | |||
public: | |||
CompNode::DeviceType device_type() override { | |||
@@ -175,19 +215,9 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl { | |||
void free_device(void *ptr); | |||
void *alloc_host(size_t size) override { | |||
activate(); | |||
void *ptr; | |||
MGB_CUDA_CHECK(cudaMallocHost(&ptr, size)); | |||
return ptr; | |||
} | |||
void *alloc_host(size_t size) override; | |||
void free_host(void *ptr) { | |||
if (!check_global_finalized()) { | |||
activate(); | |||
} | |||
MGB_CUDA_CHECK(cudaFreeHost(ptr)); | |||
} | |||
void free_host(void *ptr); | |||
void copy_to_host(void *host_ptr, | |||
const void *device_ptr, size_t size) override { | |||
@@ -284,14 +314,18 @@ struct CudaCompNodeImpl::StaticData { | |||
mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; | |||
std::unique_ptr<mem_alloc::SimpleCachingAlloc> host_alloc; | |||
CudaCompNode::CompNodeImpl node[MAX_NR_COMP_NODE]; | |||
DeviceInfo dev_info[MAX_NR_DEVICE]; | |||
int nr_node = 0, //!< number of loaded node[] | |||
nr_dev_used = 0; //!< number of used dev_info[] | |||
StaticData() { | |||
StaticData() : host_alloc( | |||
mem_alloc::SimpleCachingAlloc::make( | |||
std::make_unique<mem_alloc::CudaHostAllocator>())) { | |||
prealloc_config.max_overhead = 0; | |||
prealloc_config.alignment = 1; | |||
host_alloc->alignment(1); | |||
} | |||
~StaticData() { | |||
@@ -388,6 +422,18 @@ void CudaCompNodeImpl::free_device(void *ptr) { | |||
m_mem_alloc->free(ptr); | |||
} | |||
void* CudaCompNodeImpl::alloc_host(size_t size) { | |||
// no need for activate() here because under | |||
// unified addressing, host memory can be accessed | |||
// and freed on any device | |||
return sd->host_alloc->alloc(size); | |||
} | |||
void CudaCompNodeImpl::free_host(void* ptr) { | |||
if (check_global_finalized()) return; | |||
sd->host_alloc->free(ptr); | |||
} | |||
void CudaCompNodeImpl::peer_copy_to( | |||
Impl *dest_impl, void *dest, const void *src, size_t size) { | |||
if (dest_impl->same_type<CudaCompNodeImpl>()) { | |||
@@ -364,5 +364,57 @@ DevMemAllocImpl::~DevMemAllocImpl() { | |||
m_raw_allocator->free(i.first); | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
std::unique_ptr<SimpleCachingAlloc> SimpleCachingAlloc::make(std::unique_ptr<RawAllocator> raw_alloc) { | |||
return std::make_unique<SimpleCachingAllocImpl>(std::move(raw_alloc)); | |||
} | |||
SimpleCachingAllocImpl::SimpleCachingAllocImpl(std::unique_ptr<RawAllocator> raw_alloc) | |||
: m_raw_alloc(std::move(raw_alloc)) {} | |||
void* SimpleCachingAllocImpl::alloc(size_t size) { | |||
size = get_aligned_power2(size, m_alignment); | |||
auto&& addr = do_alloc(size, true); | |||
auto ptr = addr.addr_ptr(); | |||
MGB_LOCK_GUARD(m_mutex); | |||
m_allocated_blocks[ptr] = {addr.is_head, size}; | |||
m_used_size += size; | |||
return ptr; | |||
} | |||
void SimpleCachingAllocImpl::free(void* ptr) { | |||
MGB_LOCK_GUARD(m_mutex); | |||
auto&& iter = m_allocated_blocks.find(ptr); | |||
mgb_assert(iter != m_allocated_blocks.end(), | |||
"releasing bad pointer: %p", ptr); | |||
auto size = iter->second.size; | |||
FreeBlock fb{MemAddr{iter->second.is_head, reinterpret_cast<size_t>(ptr)}, size}; | |||
m_allocated_blocks.erase(iter); | |||
merge_free_unsafe(fb); | |||
m_used_size -= size; | |||
} | |||
SimpleCachingAllocImpl::~SimpleCachingAllocImpl() { | |||
for (auto&& ptr_size : m_alloc_from_raw) { | |||
m_raw_alloc->free(ptr_size.first); | |||
} | |||
} | |||
SimpleCachingAllocImpl::MemAddr SimpleCachingAllocImpl::alloc_from_parent(size_t size) { | |||
void* ptr = m_raw_alloc->alloc(size); | |||
m_alloc_from_raw[ptr] = size; | |||
return {true, reinterpret_cast<size_t>(ptr)}; | |||
} | |||
std::string SimpleCachingAllocImpl::get_name() const { | |||
return "SimpleCachingAllocImpl"; | |||
} | |||
size_t SimpleCachingAllocImpl::get_used_memory() { | |||
return m_used_size; | |||
} | |||
FreeMemStat SimpleCachingAllocImpl::get_free_memory_dev() { | |||
return get_free_memory(); | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -211,7 +211,32 @@ public: | |||
FreeMemStat get_free_memory_dev() override; | |||
}; | |||
class SimpleCachingAllocImpl : public SimpleCachingAlloc, | |||
public MemAllocImplHelper { | |||
struct AllocatedBlock { | |||
bool is_head; | |||
size_t size; | |||
}; | |||
std::unique_ptr<RawAllocator> m_raw_alloc; | |||
std::unordered_map<void*, size_t> m_alloc_from_raw; | |||
std::unordered_map<void*, AllocatedBlock> m_allocated_blocks; | |||
size_t m_used_size = 0; | |||
public: | |||
SimpleCachingAllocImpl(std::unique_ptr<RawAllocator> m_raw_alloc); | |||
~SimpleCachingAllocImpl(); | |||
void* alloc(size_t size) override; | |||
void free(void* ptr) override; | |||
size_t get_used_memory() override; | |||
FreeMemStat get_free_memory_dev() override; | |||
protected: | |||
MemAddr alloc_from_parent(size_t size) override; | |||
std::string get_name() const override; | |||
}; | |||
} | |||
} | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||
@@ -341,6 +341,32 @@ public: | |||
FwdDevMemAlloc(const std::shared_ptr<RawAllocator>& ra) : m_raw_alloc(ra) {} | |||
}; | |||
/* ===================== SimpleCachingAlloc ===================== */ | |||
/*! | |||
* \brief An allocator that cache allocations to reduce call to raw allocator. | |||
* Mainly used for CUDA pinned memory. | |||
*/ | |||
class SimpleCachingAlloc : virtual public MemAllocBase { | |||
protected: | |||
size_t m_alignment = 1; | |||
public: | |||
virtual ~SimpleCachingAlloc() = default; | |||
static std::unique_ptr<SimpleCachingAlloc> make(std::unique_ptr<RawAllocator> raw_alloc); | |||
virtual void* alloc(size_t size) = 0; | |||
virtual void free(void* ptr) = 0; | |||
SimpleCachingAlloc& alignment(size_t alignment) { | |||
m_alignment = alignment; | |||
return *this; | |||
}; | |||
size_t alignment() const { | |||
return m_alignment; | |||
}; | |||
}; | |||
} // mem_alloc | |||
} // mgb | |||
@@ -440,6 +440,54 @@ TEST(TestMemAlloc, RandomOprs) { | |||
ASSERT_EQ(dummy_alloc->nr_alloc(), dummy_alloc->nr_free()); | |||
} | |||
TEST(TestSimpleCachingAlloc, Basic) { | |||
constexpr size_t TOT = 2048, REQ = 1000; | |||
static_assert(TOT > REQ * 2, ""); | |||
auto raw_alloc = new DummyAllocator(TOT); | |||
auto alloc = SimpleCachingAlloc::make(std::unique_ptr<RawAllocator>(raw_alloc)); | |||
auto ptr = alloc->alloc(REQ); | |||
EXPECT_EQ(TOT - REQ, raw_alloc->free_size()); | |||
EXPECT_EQ(REQ, alloc->get_used_memory()); | |||
EXPECT_EQ(0u, alloc->get_free_memory().tot); | |||
alloc->free(ptr); | |||
EXPECT_EQ(0u, raw_alloc->nr_free()); | |||
EXPECT_EQ(REQ, alloc->get_free_memory().tot); | |||
ptr = alloc->alloc(REQ / 2); | |||
EXPECT_EQ(1u, raw_alloc->nr_alloc()); | |||
EXPECT_EQ(REQ / 2, alloc->get_used_memory()); | |||
EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot); | |||
auto ptr2 = alloc->alloc(REQ / 2); | |||
EXPECT_EQ(1u, raw_alloc->nr_alloc()); | |||
EXPECT_EQ(REQ / 2 * 2, alloc->get_used_memory()); | |||
EXPECT_EQ(REQ - REQ / 2 * 2, alloc->get_free_memory().tot); | |||
EXPECT_EQ(REQ / 2, (char*)ptr2 - (char*)ptr); | |||
alloc->free(ptr); | |||
EXPECT_EQ(1u, raw_alloc->nr_alloc()); | |||
EXPECT_EQ(REQ / 2, alloc->get_used_memory()); | |||
EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot); | |||
ptr = alloc->alloc(REQ); | |||
EXPECT_EQ(2u, raw_alloc->nr_alloc()); | |||
EXPECT_EQ(TOT - REQ * 2, raw_alloc->free_size()); | |||
EXPECT_EQ(REQ + REQ / 2, alloc->get_used_memory()); | |||
EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot); | |||
alloc->free(ptr2); | |||
ptr2 = alloc->alloc(REQ); | |||
EXPECT_EQ(2u, raw_alloc->nr_alloc()); | |||
EXPECT_EQ(REQ * 2, alloc->get_used_memory()); | |||
EXPECT_EQ(0u, alloc->get_free_memory().tot); | |||
alloc->free(ptr); | |||
alloc->free(ptr2); | |||
EXPECT_EQ(0u, raw_alloc->nr_free()); | |||
}; | |||
namespace { | |||
class DevicePolicy { | |||
public: | |||