test(mgb): add SimpleCachingAlloc test
GitOrigin-RevId: 17f381e4ac
tags/v1.0.0-rc1
@@ -99,6 +99,46 @@ public: | |||||
} | } | ||||
}; | }; | ||||
class CudaHostAllocator : public RawAllocator { | |||||
public: | |||||
void* alloc(size_t size) override { | |||||
void* addr; | |||||
cudaError_t cuda_error = cudaHostAlloc(&addr, size, cudaHostAllocDefault); | |||||
if (cuda_error == cudaSuccess) { | |||||
mgb_assert(addr); | |||||
return addr; | |||||
} | |||||
auto msg = mgb_ssprintf_log( | |||||
"cudaHostAlloc failed while requesting %zd bytes (%.3fMiB)" | |||||
" of pinned host memory; error: %s", | |||||
size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error)); | |||||
msg.append(CudaError::get_cuda_extra_info()); | |||||
if (cuda_error == cudaErrorMemoryAllocation) { | |||||
mgb_log_error("%s", msg.c_str()); | |||||
// clear cuda error | |||||
cudaGetLastError(); | |||||
mgb_assert(cudaGetLastError() == cudaSuccess); | |||||
return nullptr; | |||||
} | |||||
mgb_throw_raw(MemAllocError{msg}); | |||||
} | |||||
void free(void* ptr) override { | |||||
cudaError_t cuda_error = cudaFreeHost(ptr); | |||||
if (cuda_error == cudaSuccess) | |||||
return; | |||||
auto msg = ssprintf("cudaFreeHost failed for %p: %s", ptr, | |||||
cudaGetErrorString(cuda_error)); | |||||
msg.append(CudaError::get_cuda_extra_info()); | |||||
mgb_throw_raw(MemAllocError{msg}); | |||||
} | |||||
void get_mem_info(size_t& free, size_t& tot) override { | |||||
free = 0; | |||||
tot = 0; | |||||
} | |||||
}; | |||||
class CudaDeviceRuntimePolicy : public DeviceRuntimePolicy { | class CudaDeviceRuntimePolicy : public DeviceRuntimePolicy { | ||||
public: | public: | ||||
CompNode::DeviceType device_type() override { | CompNode::DeviceType device_type() override { | ||||
@@ -175,19 +215,9 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl { | |||||
void free_device(void *ptr); | void free_device(void *ptr); | ||||
void *alloc_host(size_t size) override { | |||||
activate(); | |||||
void *ptr; | |||||
MGB_CUDA_CHECK(cudaMallocHost(&ptr, size)); | |||||
return ptr; | |||||
} | |||||
void *alloc_host(size_t size) override; | |||||
void free_host(void *ptr) { | |||||
if (!check_global_finalized()) { | |||||
activate(); | |||||
} | |||||
MGB_CUDA_CHECK(cudaFreeHost(ptr)); | |||||
} | |||||
void free_host(void *ptr); | |||||
void copy_to_host(void *host_ptr, | void copy_to_host(void *host_ptr, | ||||
const void *device_ptr, size_t size) override { | const void *device_ptr, size_t size) override { | ||||
@@ -284,14 +314,18 @@ struct CudaCompNodeImpl::StaticData { | |||||
mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; | mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; | ||||
std::unique_ptr<mem_alloc::SimpleCachingAlloc> host_alloc; | |||||
CudaCompNode::CompNodeImpl node[MAX_NR_COMP_NODE]; | CudaCompNode::CompNodeImpl node[MAX_NR_COMP_NODE]; | ||||
DeviceInfo dev_info[MAX_NR_DEVICE]; | DeviceInfo dev_info[MAX_NR_DEVICE]; | ||||
int nr_node = 0, //!< number of loaded node[] | int nr_node = 0, //!< number of loaded node[] | ||||
nr_dev_used = 0; //!< number of used dev_info[] | nr_dev_used = 0; //!< number of used dev_info[] | ||||
StaticData() { | |||||
StaticData() : host_alloc( | |||||
mem_alloc::SimpleCachingAlloc::make( | |||||
std::make_unique<mem_alloc::CudaHostAllocator>())) { | |||||
prealloc_config.max_overhead = 0; | prealloc_config.max_overhead = 0; | ||||
prealloc_config.alignment = 1; | prealloc_config.alignment = 1; | ||||
host_alloc->alignment(1); | |||||
} | } | ||||
~StaticData() { | ~StaticData() { | ||||
@@ -388,6 +422,18 @@ void CudaCompNodeImpl::free_device(void *ptr) { | |||||
m_mem_alloc->free(ptr); | m_mem_alloc->free(ptr); | ||||
} | } | ||||
void* CudaCompNodeImpl::alloc_host(size_t size) { | |||||
// no need for activate() here because under | |||||
// unified addressing, host memory can be accessed | |||||
// and freed on any device | |||||
return sd->host_alloc->alloc(size); | |||||
} | |||||
void CudaCompNodeImpl::free_host(void* ptr) { | |||||
if (check_global_finalized()) return; | |||||
sd->host_alloc->free(ptr); | |||||
} | |||||
void CudaCompNodeImpl::peer_copy_to( | void CudaCompNodeImpl::peer_copy_to( | ||||
Impl *dest_impl, void *dest, const void *src, size_t size) { | Impl *dest_impl, void *dest, const void *src, size_t size) { | ||||
if (dest_impl->same_type<CudaCompNodeImpl>()) { | if (dest_impl->same_type<CudaCompNodeImpl>()) { | ||||
@@ -364,5 +364,57 @@ DevMemAllocImpl::~DevMemAllocImpl() { | |||||
m_raw_allocator->free(i.first); | m_raw_allocator->free(i.first); | ||||
} | } | ||||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | |||||
std::unique_ptr<SimpleCachingAlloc> SimpleCachingAlloc::make(std::unique_ptr<RawAllocator> raw_alloc) { | |||||
return std::make_unique<SimpleCachingAllocImpl>(std::move(raw_alloc)); | |||||
} | |||||
SimpleCachingAllocImpl::SimpleCachingAllocImpl(std::unique_ptr<RawAllocator> raw_alloc) | |||||
: m_raw_alloc(std::move(raw_alloc)) {} | |||||
void* SimpleCachingAllocImpl::alloc(size_t size) { | |||||
size = get_aligned_power2(size, m_alignment); | |||||
auto&& addr = do_alloc(size, true); | |||||
auto ptr = addr.addr_ptr(); | |||||
MGB_LOCK_GUARD(m_mutex); | |||||
m_allocated_blocks[ptr] = {addr.is_head, size}; | |||||
m_used_size += size; | |||||
return ptr; | |||||
} | |||||
void SimpleCachingAllocImpl::free(void* ptr) { | |||||
MGB_LOCK_GUARD(m_mutex); | |||||
auto&& iter = m_allocated_blocks.find(ptr); | |||||
mgb_assert(iter != m_allocated_blocks.end(), | |||||
"releasing bad pointer: %p", ptr); | |||||
auto size = iter->second.size; | |||||
FreeBlock fb{MemAddr{iter->second.is_head, reinterpret_cast<size_t>(ptr)}, size}; | |||||
m_allocated_blocks.erase(iter); | |||||
merge_free_unsafe(fb); | |||||
m_used_size -= size; | |||||
} | |||||
SimpleCachingAllocImpl::~SimpleCachingAllocImpl() {
    // return every chunk obtained from the underlying raw allocator
    for (auto&& chunk : m_alloc_from_raw) {
        m_raw_alloc->free(chunk.first);
    }
}
SimpleCachingAllocImpl::MemAddr SimpleCachingAllocImpl::alloc_from_parent(size_t size) {
    // grab a fresh chunk from the raw allocator; remember its size so the
    // destructor can release it
    auto raw_ptr = m_raw_alloc->alloc(size);
    m_alloc_from_raw[raw_ptr] = size;
    return MemAddr{true, reinterpret_cast<size_t>(raw_ptr)};
}
//! human-readable allocator name for logging/diagnostics
std::string SimpleCachingAllocImpl::get_name() const {
    return {"SimpleCachingAllocImpl"};
}
//! total aligned bytes currently handed out; maintained in alloc()/free()
size_t SimpleCachingAllocImpl::get_used_memory() {
    return m_used_size;
}
//! for this allocator the "device" free stat is just the cached free list
FreeMemStat SimpleCachingAllocImpl::get_free_memory_dev() {
    return get_free_memory();
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -211,7 +211,32 @@ public: | |||||
FreeMemStat get_free_memory_dev() override; | FreeMemStat get_free_memory_dev() override; | ||||
}; | }; | ||||
class SimpleCachingAllocImpl : public SimpleCachingAlloc, | |||||
public MemAllocImplHelper { | |||||
struct AllocatedBlock { | |||||
bool is_head; | |||||
size_t size; | |||||
}; | |||||
std::unique_ptr<RawAllocator> m_raw_alloc; | |||||
std::unordered_map<void*, size_t> m_alloc_from_raw; | |||||
std::unordered_map<void*, AllocatedBlock> m_allocated_blocks; | |||||
size_t m_used_size = 0; | |||||
public: | |||||
SimpleCachingAllocImpl(std::unique_ptr<RawAllocator> m_raw_alloc); | |||||
~SimpleCachingAllocImpl(); | |||||
void* alloc(size_t size) override; | |||||
void free(void* ptr) override; | |||||
size_t get_used_memory() override; | |||||
FreeMemStat get_free_memory_dev() override; | |||||
protected: | |||||
MemAddr alloc_from_parent(size_t size) override; | |||||
std::string get_name() const override; | |||||
}; | |||||
} | } | ||||
} | } | ||||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} | ||||
@@ -341,6 +341,32 @@ public: | |||||
FwdDevMemAlloc(const std::shared_ptr<RawAllocator>& ra) : m_raw_alloc(ra) {} | FwdDevMemAlloc(const std::shared_ptr<RawAllocator>& ra) : m_raw_alloc(ra) {} | ||||
}; | }; | ||||
/* ===================== SimpleCachingAlloc ===================== */ | |||||
/*! | |||||
* \brief An allocator that cache allocations to reduce call to raw allocator. | |||||
* Mainly used for CUDA pinned memory. | |||||
*/ | |||||
class SimpleCachingAlloc : virtual public MemAllocBase { | |||||
protected: | |||||
size_t m_alignment = 1; | |||||
public: | |||||
virtual ~SimpleCachingAlloc() = default; | |||||
static std::unique_ptr<SimpleCachingAlloc> make(std::unique_ptr<RawAllocator> raw_alloc); | |||||
virtual void* alloc(size_t size) = 0; | |||||
virtual void free(void* ptr) = 0; | |||||
SimpleCachingAlloc& alignment(size_t alignment) { | |||||
m_alignment = alignment; | |||||
return *this; | |||||
}; | |||||
size_t alignment() const { | |||||
return m_alignment; | |||||
}; | |||||
}; | |||||
} // mem_alloc | } // mem_alloc | ||||
} // mgb | } // mgb | ||||
@@ -440,6 +440,54 @@ TEST(TestMemAlloc, RandomOprs) { | |||||
ASSERT_EQ(dummy_alloc->nr_alloc(), dummy_alloc->nr_free()); | ASSERT_EQ(dummy_alloc->nr_alloc(), dummy_alloc->nr_free()); | ||||
} | } | ||||
TEST(TestSimpleCachingAlloc, Basic) {
    constexpr size_t TOT = 2048, REQ = 1000;
    static_assert(TOT > REQ * 2, "need room for two live REQ-sized blocks");
    // raw_alloc ownership passes to the caching allocator below
    auto raw_alloc = new DummyAllocator(TOT);
    auto alloc = SimpleCachingAlloc::make(std::unique_ptr<RawAllocator>(raw_alloc));

    // first request must hit the raw allocator
    auto ptr = alloc->alloc(REQ);
    EXPECT_EQ(TOT - REQ, raw_alloc->free_size());
    EXPECT_EQ(REQ, alloc->get_used_memory());
    EXPECT_EQ(0u, alloc->get_free_memory().tot);

    // freeing caches the block instead of returning it to the raw allocator
    alloc->free(ptr);
    EXPECT_EQ(0u, raw_alloc->nr_free());
    EXPECT_EQ(REQ, alloc->get_free_memory().tot);

    // a smaller request is served by splitting the cached block
    ptr = alloc->alloc(REQ / 2);
    EXPECT_EQ(1u, raw_alloc->nr_alloc());
    EXPECT_EQ(REQ / 2, alloc->get_used_memory());
    EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot);

    // the remaining half also comes from cache, contiguous with the first
    auto ptr2 = alloc->alloc(REQ / 2);
    EXPECT_EQ(1u, raw_alloc->nr_alloc());
    EXPECT_EQ(REQ / 2 * 2, alloc->get_used_memory());
    EXPECT_EQ(REQ - REQ / 2 * 2, alloc->get_free_memory().tot);
    EXPECT_EQ(REQ / 2, (char*)ptr2 - (char*)ptr);

    alloc->free(ptr);
    EXPECT_EQ(1u, raw_alloc->nr_alloc());
    EXPECT_EQ(REQ / 2, alloc->get_used_memory());
    EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot);

    // cached half-block cannot satisfy a full REQ: raw allocator hit again
    ptr = alloc->alloc(REQ);
    EXPECT_EQ(2u, raw_alloc->nr_alloc());
    EXPECT_EQ(TOT - REQ * 2, raw_alloc->free_size());
    EXPECT_EQ(REQ + REQ / 2, alloc->get_used_memory());
    EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot);

    // after freeing ptr2 the two cached halves merge and can serve REQ
    alloc->free(ptr2);
    ptr2 = alloc->alloc(REQ);
    EXPECT_EQ(2u, raw_alloc->nr_alloc());
    EXPECT_EQ(REQ * 2, alloc->get_used_memory());
    EXPECT_EQ(0u, alloc->get_free_memory().tot);

    // everything stays cached: the raw allocator never sees a free
    alloc->free(ptr);
    alloc->free(ptr2);
    EXPECT_EQ(0u, raw_alloc->nr_free());
}
namespace { | namespace { | ||||
class DevicePolicy { | class DevicePolicy { | ||||
public: | public: | ||||