
rdma_pool_allocator.cc 6.7 kB

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "graph/manager/rdma_pool_allocator.h"

#include <framework/common/debug/log.h>

#include "framework/common/debug/ge_log.h"
#include "graph/ge_context.h"
#include "runtime/dev.h"
namespace {
const size_t kAlignedSize = 512;
const float kSplitThreshold = 0.5;

// Round the requested size up to the next multiple of kAlignedSize; a
// zero-sized request still gets one aligned unit.
inline size_t GetAlignedBlockSize(size_t size) {
  if (size == 0) {
    return kAlignedSize;
  }
  return kAlignedSize * ((size + kAlignedSize - 1) / kAlignedSize);
}
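
// Worked example (illustrative, not part of the original file): with
// kAlignedSize = 512, GetAlignedBlockSize(1) == 512,
// GetAlignedBlockSize(512) == 512, and GetAlignedBlockSize(513) == 1024;
// every request is rounded up to the next 512-byte boundary.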

// A free block is only split when the request would use at most half of it.
inline bool ShouldSplit(const ge::Block *block, size_t size) {
  return static_cast<double>(size) <= (static_cast<double>(block->size) * kSplitThreshold);
}

// Only unallocated neighbours can be coalesced.
inline bool CanMerge(ge::Block *block) { return block != nullptr && !block->allocated; }
}  // namespace
namespace ge {
// Blocks in the bin are ordered by size first, then by address, so
// lower_bound() performs a best-fit lookup.
RdmaPoolAllocator::RdmaPoolAllocator(rtMemType_t memory_type)
    : memory_type_(memory_type), block_bin_(BlockBin([](const Block *left, const Block *right) {
        if (left->size != right->size) {
          return left->size < right->size;
        }
        return reinterpret_cast<uintptr_t>(left->ptr) < reinterpret_cast<uintptr_t>(right->ptr);
      })) {}

Status RdmaPoolAllocator::Initialize() {
  memory_allocator_ = MemManager::Instance(memory_type_);
  if (memory_allocator_ == nullptr) {
    return ge::FAILED;
  }
  return ge::SUCCESS;
}

void RdmaPoolAllocator::Finalize() {
  GELOGD("Rdma pool finalize start.");
  for (auto it = allocated_blocks_.begin(); it != allocated_blocks_.end();) {
    auto block = it->second;
    it = allocated_blocks_.erase(it);
    delete block;
  }
  for (auto it = block_bin_.begin(); it != block_bin_.end();) {
    auto block = *it;
    it = block_bin_.erase(it);
    delete block;
  }
  if (rdma_base_addr_ != nullptr) {
    GELOGD("Start to free rdma pool memory.");
    if (memory_allocator_->FreeMemory(rdma_base_addr_) != SUCCESS) {
      GELOGW("Free rdma pool memory failed");
    }
    rdma_base_addr_ = nullptr;
  }
}

Status RdmaPoolAllocator::InitMemory(size_t mem_size) {
  auto device_id = GetContext().DeviceId();
  GELOGD("Init rdma memory with size [%zu] for device id:[%u]", mem_size, device_id);
  if (rdma_base_addr_ != nullptr) {
    GELOGE(GE_MULTI_INIT, "Rdma pool has already been allocated");
    return GE_MULTI_INIT;
  }
  const std::string purpose = "Memory for rdma pool.";
  std::lock_guard<std::recursive_mutex> lock(mutex_);
  auto dev_id = static_cast<int32_t>(device_id);
  GE_CHK_RT_RET(rtSetDevice(dev_id));
  // Ensure the device is reset when this scope exits, after the memory work has finished.
  GE_MAKE_GUARD(not_used_var, [&] { GE_CHK_RT(rtDeviceReset(dev_id)); });
  rdma_base_addr_ = memory_allocator_->MallocMemory(purpose, mem_size, device_id);
  if (rdma_base_addr_ == nullptr) {
    GELOGE(GE_GRAPH_MALLOC_FAILED, "Rdma pool memory malloc failed");
    return GE_GRAPH_MALLOC_FAILED;
  }
  rdma_mem_size_ = mem_size;
  // Initialize the bin with a single base block covering the whole pool.
  auto *base_block = new (std::nothrow) Block(device_id, mem_size, rdma_base_addr_);
  if (base_block == nullptr) {
    GELOGE(GE_GRAPH_MALLOC_FAILED, "Block malloc failed");
    return GE_GRAPH_MALLOC_FAILED;
  }
  block_bin_.insert(base_block);
  return SUCCESS;
}

uint8_t *RdmaPoolAllocator::Malloc(size_t size, uint32_t device_id) {
  GELOGI("Start to malloc rdma memory size:%zu, device id = %u", size, device_id);
  auto aligned_size = GetAlignedBlockSize(size);
  Block key(device_id, aligned_size, nullptr);
  std::lock_guard<std::recursive_mutex> lock(mutex_);
  // Best fit: the first block in the bin whose size is >= aligned_size.
  auto it = block_bin_.lower_bound(&key);
  if (it != block_bin_.end()) {
    Block *block = *it;
    block_bin_.erase(it);
    block->allocated = true;
    if (block->ptr == nullptr) {
      GELOGE(INTERNAL_ERROR, "Rdma pool memory address is nullptr.");
      return nullptr;
    }
    allocated_blocks_.emplace(block->ptr, block);
    if (ShouldSplit(block, aligned_size)) {
      GELOGD("Block will be split, block size = %zu, aligned_size:%zu", block->size, aligned_size);
      auto *new_block =
          new (std::nothrow) Block(device_id, block->size - aligned_size, nullptr, block->ptr + aligned_size);
      if (new_block == nullptr) {
        GELOGW("Block split failed");
        return block->ptr;
      }
      // Link the remainder into the address-ordered block list and return it to the bin.
      new_block->next = block->next;
      if (block->next != nullptr) {
        block->next->prev = new_block;
      }
      new_block->prev = block;
      block->next = new_block;
      block->size = aligned_size;
      block_bin_.insert(new_block);
    }
    GELOGD("Found block size = %zu", block->size);
    return block->ptr;
  }
  GELOGW("Memory block not found.");
  return nullptr;
}
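
// Illustrative walk-through (not part of the original file): with a fresh
// 4096-byte pool, Malloc(512, 0) finds the 4096-byte base block via
// lower_bound, and since 512 <= 4096 * kSplitThreshold the block is split
// into an allocated 512-byte block and a free 3584-byte remainder that is
// returned to the bin.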

Status RdmaPoolAllocator::Free(uint8_t *memory_addr, uint32_t device_id) {
  GELOGI("Free rdma memory, device id = %u", device_id);
  if (memory_addr == nullptr) {
    GELOGE(GE_GRAPH_FREE_FAILED, "Invalid memory pointer");
    return GE_GRAPH_FREE_FAILED;
  }
  std::lock_guard<std::recursive_mutex> lock(mutex_);
  auto it = allocated_blocks_.find(memory_addr);
  if (it == allocated_blocks_.end()) {
    GELOGE(PARAM_INVALID, "Invalid memory pointer");
    return PARAM_INVALID;
  }
  Block *block = it->second;
  block->allocated = false;
  allocated_blocks_.erase(it);
  // Try to coalesce with both neighbours before returning the block to the bin.
  Block *merge_blocks[] = {block->prev, block->next};
  for (Block *merge_block : merge_blocks) {
    MergeBlocks(block, merge_block);
  }
  block_bin_.insert(block);
  return SUCCESS;
}

void RdmaPoolAllocator::MergeBlocks(Block *dst, Block *src) {
  if (!CanMerge(dst) || !CanMerge(src)) {
    return;
  }
  if (dst->prev == src) {
    // Absorb the previous block: dst grows downwards, taking src's start address.
    dst->ptr = src->ptr;
    dst->prev = src->prev;
    if (dst->prev != nullptr) {
      dst->prev->next = dst;
    }
  } else {
    // Absorb the next block: dst keeps its start address and grows upwards.
    dst->next = src->next;
    if (dst->next != nullptr) {
      dst->next->prev = dst;
    }
  }
  dst->size += src->size;
  block_bin_.erase(src);
  delete src;
}
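
// Illustrative walk-through (not part of the original file): if blocks A, B
// and C are contiguous and A and C are already free when B is freed, Free()
// first merges A into B (B's ptr moves down to A's address), then merges C
// into B, leaving a single free block of size A + B + C in the bin.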

Status RdmaPoolAllocator::GetBaseAddr(uint64_t &base_addr, uint64_t &mem_size) {
  if (rdma_base_addr_ == nullptr) {
    GELOGE(INTERNAL_ERROR, "Rdma base addr is nullptr.");
    return INTERNAL_ERROR;
  }
  base_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(rdma_base_addr_));
  mem_size = rdma_mem_size_;
  return SUCCESS;
}
}  // namespace ge
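
A minimal usage sketch of this allocator's public API, based only on the functions defined above. It assumes an Ascend runtime environment where RT_MEMORY_HBM is a valid rtMemType_t and device 0 is available; the pool and request sizes are illustrative:

#include "graph/manager/rdma_pool_allocator.h"

ge::Status RunRdmaPoolDemo() {
  // Assumption: RT_MEMORY_HBM is the memory type backing the pool.
  ge::RdmaPoolAllocator allocator(RT_MEMORY_HBM);
  if (allocator.Initialize() != ge::SUCCESS) {
    return ge::FAILED;
  }
  // Reserve a 16 MB pool; the bin starts with one 16 MB base block.
  if (allocator.InitMemory(16UL * 1024 * 1024) != ge::SUCCESS) {
    return ge::FAILED;
  }
  // 1000 bytes is rounded up to 1024; the base block is split.
  uint8_t *buf = allocator.Malloc(1000, 0);
  if (buf == nullptr) {
    return ge::FAILED;
  }
  // Freeing coalesces the block with its free neighbour again.
  if (allocator.Free(buf, 0) != ge::SUCCESS) {
    return ge::FAILED;
  }
  allocator.Finalize();
  return ge::SUCCESS;
}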

The Graph Engine (GE) module is a submodule of MindSpore, implemented in C++. It sits between the front-end module ME and the underlying hardware, acting as the bridge between them. GE takes the graph delivered by ME as input, performs a series of deep graph optimizations, and finally outputs a graph that can run efficiently on the underlying hardware. GE is specifically optimized for the hardware architecture of the Ascend AI processor in order to fully exploit its computing power. During model training and inference, GE is invoked automatically and is transparent to the user. GE mainly consists of two parts, GE API and GE Core; the detailed architecture diagram is shown below.