Browse Source

perf(mgb): add CUDA host memory allocator

test(mgb): add SimpleCachingAlloc test

GitOrigin-RevId: 17f381e4ac
tags/v1.0.0-rc1
Megvii Engine Team 4 years ago
parent
commit
b43fb1a97c
5 changed files with 212 additions and 15 deletions
  1. +59
    -13
      src/core/impl/comp_node/cuda/comp_node.cpp
  2. +53
    -1
      src/core/impl/comp_node/mem_alloc/impl.cpp
  3. +26
    -1
      src/core/impl/comp_node/mem_alloc/impl.h
  4. +26
    -0
      src/core/include/megbrain/comp_node/alloc.h
  5. +48
    -0
      src/core/test/mem_alloc.cpp

+ 59
- 13
src/core/impl/comp_node/cuda/comp_node.cpp View File

@@ -99,6 +99,46 @@ public:
}
};

class CudaHostAllocator : public RawAllocator {
public:
void* alloc(size_t size) override {
void* addr;
cudaError_t cuda_error = cudaHostAlloc(&addr, size, cudaHostAllocDefault);
if (cuda_error == cudaSuccess) {
mgb_assert(addr);
return addr;
}
auto msg = mgb_ssprintf_log(
"cudaHostAlloc failed while requesting %zd bytes (%.3fMiB)"
" of pinned host memory; error: %s",
size, size / (1024.0 * 1024), cudaGetErrorString(cuda_error));
msg.append(CudaError::get_cuda_extra_info());
if (cuda_error == cudaErrorMemoryAllocation) {
mgb_log_error("%s", msg.c_str());
// clear cuda error
cudaGetLastError();
mgb_assert(cudaGetLastError() == cudaSuccess);
return nullptr;
}
mgb_throw_raw(MemAllocError{msg});
}

void free(void* ptr) override {
cudaError_t cuda_error = cudaFreeHost(ptr);
if (cuda_error == cudaSuccess)
return;
auto msg = ssprintf("cudaFreeHost failed for %p: %s", ptr,
cudaGetErrorString(cuda_error));
msg.append(CudaError::get_cuda_extra_info());
mgb_throw_raw(MemAllocError{msg});
}

void get_mem_info(size_t& free, size_t& tot) override {
free = 0;
tot = 0;
}
};

class CudaDeviceRuntimePolicy : public DeviceRuntimePolicy {
public:
CompNode::DeviceType device_type() override {
@@ -175,19 +215,9 @@ class CudaCompNode::CompNodeImpl final: public CompNode::Impl {

void free_device(void *ptr);

void *alloc_host(size_t size) override {
activate();
void *ptr;
MGB_CUDA_CHECK(cudaMallocHost(&ptr, size));
return ptr;
}
void *alloc_host(size_t size) override;

void free_host(void *ptr) {
if (!check_global_finalized()) {
activate();
}
MGB_CUDA_CHECK(cudaFreeHost(ptr));
}
void free_host(void *ptr);

void copy_to_host(void *host_ptr,
const void *device_ptr, size_t size) override {
@@ -284,14 +314,18 @@ struct CudaCompNodeImpl::StaticData {

mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config;

std::unique_ptr<mem_alloc::SimpleCachingAlloc> host_alloc;
CudaCompNode::CompNodeImpl node[MAX_NR_COMP_NODE];
DeviceInfo dev_info[MAX_NR_DEVICE];
int nr_node = 0, //!< number of loaded node[]
nr_dev_used = 0; //!< number of used dev_info[]

StaticData() {
StaticData() : host_alloc(
mem_alloc::SimpleCachingAlloc::make(
std::make_unique<mem_alloc::CudaHostAllocator>())) {
prealloc_config.max_overhead = 0;
prealloc_config.alignment = 1;
host_alloc->alignment(1);
}

~StaticData() {
@@ -388,6 +422,18 @@ void CudaCompNodeImpl::free_device(void *ptr) {
m_mem_alloc->free(ptr);
}

void* CudaCompNodeImpl::alloc_host(size_t size) {
// no need for activate() here because under
// unified addressing, host memory can be accessed
// and freed on any device
return sd->host_alloc->alloc(size);
}

void CudaCompNodeImpl::free_host(void* ptr) {
if (check_global_finalized()) return;
sd->host_alloc->free(ptr);
}

void CudaCompNodeImpl::peer_copy_to(
Impl *dest_impl, void *dest, const void *src, size_t size) {
if (dest_impl->same_type<CudaCompNodeImpl>()) {


+ 53
- 1
src/core/impl/comp_node/mem_alloc/impl.cpp View File

@@ -364,5 +364,57 @@ DevMemAllocImpl::~DevMemAllocImpl() {
m_raw_allocator->free(i.first);
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
std::unique_ptr<SimpleCachingAlloc> SimpleCachingAlloc::make(std::unique_ptr<RawAllocator> raw_alloc) {
return std::make_unique<SimpleCachingAllocImpl>(std::move(raw_alloc));
}

SimpleCachingAllocImpl::SimpleCachingAllocImpl(std::unique_ptr<RawAllocator> raw_alloc)
: m_raw_alloc(std::move(raw_alloc)) {}

void* SimpleCachingAllocImpl::alloc(size_t size) {
size = get_aligned_power2(size, m_alignment);
auto&& addr = do_alloc(size, true);
auto ptr = addr.addr_ptr();
MGB_LOCK_GUARD(m_mutex);
m_allocated_blocks[ptr] = {addr.is_head, size};
m_used_size += size;
return ptr;
}

void SimpleCachingAllocImpl::free(void* ptr) {
MGB_LOCK_GUARD(m_mutex);
auto&& iter = m_allocated_blocks.find(ptr);
mgb_assert(iter != m_allocated_blocks.end(),
"releasing bad pointer: %p", ptr);
auto size = iter->second.size;
FreeBlock fb{MemAddr{iter->second.is_head, reinterpret_cast<size_t>(ptr)}, size};
m_allocated_blocks.erase(iter);
merge_free_unsafe(fb);
m_used_size -= size;
}

SimpleCachingAllocImpl::~SimpleCachingAllocImpl() {
for (auto&& ptr_size : m_alloc_from_raw) {
m_raw_alloc->free(ptr_size.first);
}
}

SimpleCachingAllocImpl::MemAddr SimpleCachingAllocImpl::alloc_from_parent(size_t size) {
void* ptr = m_raw_alloc->alloc(size);
m_alloc_from_raw[ptr] = size;
return {true, reinterpret_cast<size_t>(ptr)};
}

std::string SimpleCachingAllocImpl::get_name() const {
return "SimpleCachingAllocImpl";
}

size_t SimpleCachingAllocImpl::get_used_memory() {
return m_used_size;
}

FreeMemStat SimpleCachingAllocImpl::get_free_memory_dev() {
return get_free_memory();
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

+ 26
- 1
src/core/impl/comp_node/mem_alloc/impl.h View File

@@ -211,7 +211,32 @@ public:
FreeMemStat get_free_memory_dev() override;
};

class SimpleCachingAllocImpl : public SimpleCachingAlloc,
public MemAllocImplHelper {
struct AllocatedBlock {
bool is_head;
size_t size;
};

std::unique_ptr<RawAllocator> m_raw_alloc;
std::unordered_map<void*, size_t> m_alloc_from_raw;
std::unordered_map<void*, AllocatedBlock> m_allocated_blocks;
size_t m_used_size = 0;

public:
SimpleCachingAllocImpl(std::unique_ptr<RawAllocator> m_raw_alloc);
~SimpleCachingAllocImpl();

void* alloc(size_t size) override;
void free(void* ptr) override;
size_t get_used_memory() override;
FreeMemStat get_free_memory_dev() override;

protected:
MemAddr alloc_from_parent(size_t size) override;
std::string get_name() const override;
};

}
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}


+ 26
- 0
src/core/include/megbrain/comp_node/alloc.h View File

@@ -341,6 +341,32 @@ public:
FwdDevMemAlloc(const std::shared_ptr<RawAllocator>& ra) : m_raw_alloc(ra) {}
};

/* ===================== SimpleCachingAlloc ===================== */
/*!
* \brief An allocator that cache allocations to reduce call to raw allocator.
* Mainly used for CUDA pinned memory.
*/
class SimpleCachingAlloc : virtual public MemAllocBase {
protected:
size_t m_alignment = 1;

public:
virtual ~SimpleCachingAlloc() = default;
static std::unique_ptr<SimpleCachingAlloc> make(std::unique_ptr<RawAllocator> raw_alloc);

virtual void* alloc(size_t size) = 0;
virtual void free(void* ptr) = 0;

SimpleCachingAlloc& alignment(size_t alignment) {
m_alignment = alignment;
return *this;
};

size_t alignment() const {
return m_alignment;
};
};

} // mem_alloc
} // mgb



+ 48
- 0
src/core/test/mem_alloc.cpp View File

@@ -440,6 +440,54 @@ TEST(TestMemAlloc, RandomOprs) {
ASSERT_EQ(dummy_alloc->nr_alloc(), dummy_alloc->nr_free());
}

TEST(TestSimpleCachingAlloc, Basic) {
constexpr size_t TOT = 2048, REQ = 1000;
static_assert(TOT > REQ * 2, "");
auto raw_alloc = new DummyAllocator(TOT);
auto alloc = SimpleCachingAlloc::make(std::unique_ptr<RawAllocator>(raw_alloc));

auto ptr = alloc->alloc(REQ);
EXPECT_EQ(TOT - REQ, raw_alloc->free_size());
EXPECT_EQ(REQ, alloc->get_used_memory());
EXPECT_EQ(0u, alloc->get_free_memory().tot);

alloc->free(ptr);
EXPECT_EQ(0u, raw_alloc->nr_free());
EXPECT_EQ(REQ, alloc->get_free_memory().tot);

ptr = alloc->alloc(REQ / 2);
EXPECT_EQ(1u, raw_alloc->nr_alloc());
EXPECT_EQ(REQ / 2, alloc->get_used_memory());
EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot);

auto ptr2 = alloc->alloc(REQ / 2);
EXPECT_EQ(1u, raw_alloc->nr_alloc());
EXPECT_EQ(REQ / 2 * 2, alloc->get_used_memory());
EXPECT_EQ(REQ - REQ / 2 * 2, alloc->get_free_memory().tot);
EXPECT_EQ(REQ / 2, (char*)ptr2 - (char*)ptr);

alloc->free(ptr);
EXPECT_EQ(1u, raw_alloc->nr_alloc());
EXPECT_EQ(REQ / 2, alloc->get_used_memory());
EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot);

ptr = alloc->alloc(REQ);
EXPECT_EQ(2u, raw_alloc->nr_alloc());
EXPECT_EQ(TOT - REQ * 2, raw_alloc->free_size());
EXPECT_EQ(REQ + REQ / 2, alloc->get_used_memory());
EXPECT_EQ(REQ - REQ / 2, alloc->get_free_memory().tot);

alloc->free(ptr2);
ptr2 = alloc->alloc(REQ);
EXPECT_EQ(2u, raw_alloc->nr_alloc());
EXPECT_EQ(REQ * 2, alloc->get_used_memory());
EXPECT_EQ(0u, alloc->get_free_memory().tot);

alloc->free(ptr);
alloc->free(ptr2);
EXPECT_EQ(0u, raw_alloc->nr_free());
};

namespace {
class DevicePolicy {
public:


Loading…
Cancel
Save