Browse Source

GeTensor aligned addr & zero copy support

tags/v1.2.0
chenyemeng 4 years ago
parent
commit
87021d3f20
16 changed files with 42 additions and 38 deletions
  1. +6
    -4
      ge/CMakeLists.txt
  2. +3
    -1
      ge/executor/CMakeLists.txt
  3. +1
    -1
      ge/ge_local_engine/engine/host_cpu_engine.cc
  4. +3
    -3
      ge/graph/manager/graph_manager.cc
  5. +4
    -4
      ge/graph/manager/graph_mem_allocator.cc
  6. +3
    -3
      ge/graph/manager/graph_mem_allocator.h
  7. +2
    -2
      ge/graph/manager/host_mem_manager.cc
  8. +1
    -1
      ge/graph/manager/host_mem_manager.h
  9. +1
    -1
      ge/graph/passes/assign_pass.cc
  10. +1
    -1
      ge/graph/passes/assign_pass.h
  11. +2
    -2
      ge/graph/passes/constant_fuse_same_pass.cc
  12. +3
    -3
      ge/graph/passes/constant_fuse_same_pass.h
  13. +3
    -3
      ge/graph/preprocess/graph_preprocess.cc
  14. +3
    -3
      ge/hybrid/common/npu_memory_allocator.cc
  15. +3
    -3
      ge/hybrid/model/hybrid_model_builder.cc
  16. +3
    -3
      ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc

+ 6
- 4
ge/CMakeLists.txt View File

@@ -125,7 +125,7 @@ set(TRAIN_SRC_LIST
"graph/manager/graph_var_manager.cc"
"graph/manager/host_mem_manager.cc"
"graph/manager/rdma_pool_allocator.cc"
$<$<STREQUAL:${ENABLE_OPEN_SRC},FALSE>:graph/manager/host_mem_allocator.cc>
$<$<NOT:$<STREQUAL:${ENABLE_OPEN_SRC},True>>:graph/manager/host_mem_allocator.cc>
"graph/manager/memory_api.cc"
"graph/manager/model_manager/event_manager.cc"
"graph/manager/trans_var_data_utils.cc"
@@ -167,7 +167,7 @@ set(TRAIN_SRC_LIST
"graph/passes/hccl_group_pass.cc"
"graph/passes/enter_pass.cc"
"graph/passes/assign_pass.cc"
$<$<STREQUAL:${ENABLE_OPEN_SRC},FALSE>:graph/passes/inplace_support_check_pass.cc>
$<$<NOT:$<STREQUAL:${ENABLE_OPEN_SRC},True>>:graph/passes/inplace_support_check_pass.cc>
"graph/passes/flow_ctrl_pass.cc"
"graph/passes/global_step_insert_pass.cc"
"host_kernels/transpose_kernel.cc"
@@ -403,7 +403,7 @@ set(INFER_SRC_LIST
"graph/manager/graph_var_manager.cc"
"graph/manager/host_mem_manager.cc"
"graph/manager/rdma_pool_allocator.cc"
$<$<STREQUAL:${ENABLE_OPEN_SRC},FALSE>:graph/manager/host_mem_allocator.cc>
$<$<NOT:$<STREQUAL:${ENABLE_OPEN_SRC},True>>:graph/manager/host_mem_allocator.cc>
"graph/manager/graph_mem_allocator.cc"
"graph/manager/graph_caching_allocator.cc"
"model/ge_model.cc"
@@ -525,7 +525,7 @@ set(INFER_SRC_LIST
"graph/passes/for_pass.cc"
"graph/passes/enter_pass.cc"
"graph/passes/assign_pass.cc"
$<$<STREQUAL:${ENABLE_OPEN_SRC},FALSE>:graph/passes/inplace_support_check_pass.cc>
$<$<NOT:$<STREQUAL:${ENABLE_OPEN_SRC},True>>:graph/passes/inplace_support_check_pass.cc>
"graph/passes/addn_pass.cc"
"graph/passes/common_subexpression_elimination_pass.cc"
"graph/passes/remove_same_const_pass.cc"
@@ -624,6 +624,7 @@ target_compile_definitions(ge_runner PRIVATE
FMK_SUPPORT_DUMP
DAVINCI_CLOUD
google=ascend_private
$<$<STREQUAL:${ENABLE_OPEN_SRC},True>:ONLY_COMPILE_OPEN_SRC>
)

target_compile_options(ge_runner PRIVATE
@@ -691,6 +692,7 @@ target_compile_definitions(ge_compiler PRIVATE
FMK_HOST_INFER
COMPILE_OMG_PACKAGE
google=ascend_private
$<$<STREQUAL:${ENABLE_OPEN_SRC},True>:ONLY_COMPILE_OPEN_SRC>
)

target_compile_options(ge_compiler PRIVATE


+ 3
- 1
ge/executor/CMakeLists.txt View File

@@ -28,7 +28,7 @@ set(SRC_LIST
"../graph/manager/trans_var_data_utils.cc"
"../graph/manager/util/debug.cc"
"../graph/manager/rdma_pool_allocator.cc"
$<$<STREQUAL:${ENABLE_OPEN_SRC},FALSE>:../graph/manager/host_mem_allocator.cc>
$<$<NOT:$<STREQUAL:${ENABLE_OPEN_SRC},True>>:../graph/manager/host_mem_allocator.cc>
"../hybrid/node_executor/aicpu/aicpu_ext_info.cc"
"../model/ge_model.cc"
"../model/ge_root_model.cc"
@@ -175,6 +175,7 @@ target_compile_definitions(ge_executor PRIVATE
$<IF:$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>,OS_TYPE=WIN,OS_TYPE=0>
$<$<STREQUAL:${TARGET_SYSTEM_NAME},Windows>:SECUREC_USING_STD_SECURE_LIB=0 NOMINMAX>
LOG_CPP
$<$<STREQUAL:${ENABLE_OPEN_SRC},True>:ONLY_COMPILE_OPEN_SRC>
)

target_include_directories(ge_executor PRIVATE
@@ -217,6 +218,7 @@ target_compile_definitions(ge_executor_shared PRIVATE
PROTOBUF_INLINE_NOT_IN_HEADERS=0
DAVINCI_SUPPORT_PROFILING
google=ascend_private
$<$<STREQUAL:${ENABLE_OPEN_SRC},True>:ONLY_COMPILE_OPEN_SRC>
)

target_include_directories(ge_executor_shared PRIVATE


+ 1
- 1
ge/ge_local_engine/engine/host_cpu_engine.cc View File

@@ -26,7 +26,7 @@
#include "common/math/math_util.h"

namespace {
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
#define CREATE_OUTPUT_CASE(DTYPE, TYPE) \
case (DTYPE): { \
GeTensorPtr ge_tensor = nullptr; \


+ 3
- 3
ge/graph/manager/graph_manager.cc View File

@@ -38,7 +38,7 @@
#include "graph/partition/stage_partition.h"
#include "graph/passes/addn_pass.h"
#include "graph/passes/bitcast_pass.h"
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
#include "graph/passes/assign_pass.h"
#include "graph/passes/inplace_support_check_pass.h"
#endif
@@ -2241,7 +2241,7 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) {
ReshapeRemovePass reshape_remove_pass;
CondRemovePass condition_remove_pass;
BitcastPass bitcast_pass;
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
AssignPass assign_pass;
InplaceSupportCheckPass inplace_support_check_pass;
#endif
@@ -2249,7 +2249,7 @@ Status GraphManager::OptimizeStage2(ge::ComputeGraphPtr &compute_graph) {
names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass);
names_to_passes.emplace_back("CondRemovePass", &condition_remove_pass);
names_to_passes.emplace_back("BitcastPass", &bitcast_pass);
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
if (GetContext().GetHostExecFlag()) {
names_to_passes.emplace_back("AssignPass", &assign_pass);
names_to_passes.emplace_back("InplaceSupportCheckPass", &inplace_support_check_pass);


+ 4
- 4
ge/graph/manager/graph_mem_allocator.cc View File

@@ -19,7 +19,7 @@
#include <string>
#include "graph/manager/graph_caching_allocator.h"
#include "graph/manager/rdma_pool_allocator.h"
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
#include "graph/manager/host_mem_allocator.h"
#endif
namespace ge {
@@ -192,7 +192,7 @@ Status MemManager::Initialize(const std::vector<rtMemType_t> &memory_type) {
GELOGE(ge::INTERNAL_ERROR, "Create RdmaAllocator failed.");
return ge::INTERNAL_ERROR;
}
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
if (InitAllocator(memory_type, host_allocator_map_) != SUCCESS) {
GELOGE(ge::INTERNAL_ERROR, "Create HostMemAllocator failed.");
return ge::INTERNAL_ERROR;
@@ -219,7 +219,7 @@ void MemManager::Finalize() noexcept {
// caching and rdma allocator use memory allocator, so finalize them first
FinalizeAllocatorMap(caching_allocator_map_);
FinalizeAllocatorMap(rdma_allocator_map_);
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
FinalizeAllocatorMap(host_allocator_map_);
#endif
FinalizeAllocatorMap(memory_allocator_map_);
@@ -250,7 +250,7 @@ CachingAllocator &MemManager::CachingInstance(rtMemType_t memory_type) {
// Returns the RdmaPoolAllocator associated with memory_type, obtained by
// looking it up in rdma_allocator_map_ through GetAllocator on the
// MemManager returned by Instance().
// NOTE(review): what happens when no allocator is registered for memory_type
// is decided inside GetAllocator, which is not visible here -- confirm.
RdmaPoolAllocator &MemManager::RdmaPoolInstance(rtMemType_t memory_type) {
return Instance().GetAllocator(memory_type, rdma_allocator_map_);
}
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
// Returns the HostMemAllocator associated with memory_type, obtained by
// looking it up in host_allocator_map_ through GetAllocator on the
// MemManager returned by Instance().
// NOTE(review): what happens when no allocator is registered for memory_type
// is decided inside GetAllocator, which is not visible here -- confirm.
HostMemAllocator &MemManager::HostMemInstance(rtMemType_t memory_type) {
return Instance().GetAllocator(memory_type, host_allocator_map_);
}


+ 3
- 3
ge/graph/manager/graph_mem_allocator.h View File

@@ -139,7 +139,7 @@ class MemoryAllocator {
using MemoryAllocatorPtr = std::shared_ptr<MemoryAllocator>;
class CachingAllocator;
class RdmaPoolAllocator;
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
class HostMemAllocator;
#endif
class MemManager {
@@ -150,7 +150,7 @@ class MemManager {
static MemoryAllocator *Instance(rtMemType_t memory_type);
CachingAllocator &CachingInstance(rtMemType_t memory_type);
RdmaPoolAllocator &RdmaPoolInstance(rtMemType_t memory_type);
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
HostMemAllocator &HostMemInstance(rtMemType_t memory_type);
#endif
MemManager(const MemManager &) = delete;
@@ -240,7 +240,7 @@ class MemManager {
std::map<rtMemType_t, MemoryAllocator *> memory_allocator_map_;
std::map<rtMemType_t, CachingAllocator *> caching_allocator_map_;
std::map<rtMemType_t, RdmaPoolAllocator *> rdma_allocator_map_;
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
std::map<rtMemType_t, HostMemAllocator *> host_allocator_map_;
#endif
std::recursive_mutex allocator_mutex_;


+ 2
- 2
ge/graph/manager/host_mem_manager.cc View File

@@ -43,7 +43,7 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) {
return GE_GRAPH_MEMORY_ALLOC_FAILED;
}
mem_info.fd = output_para.fd;
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
mem_info.host_aligned_ptr = AlignedPtr::BuildAlignedPtr(mem_info.mem_size,
[&output_para](std::unique_ptr<uint8_t[], deleter> &ptr) {
GELOGD("set aligned_ptr, addr=%p", output_para.ptr);
@@ -62,7 +62,7 @@ Status SharedMemAllocator::Allocate(SharedMemInfo &mem_info) {

Status SharedMemAllocator::DeAllocate(SharedMemInfo &mem_info) {
GELOGD("SharedMemAllocator::DeAllocate");
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
rtFreeHostSharedMemoryIn free_para = {mem_info.shm_name.c_str(), mem_info.mem_size, mem_info.fd,
mem_info.host_aligned_ptr->MutableGet(), mem_info.device_address};
#else


+ 1
- 1
ge/graph/manager/host_mem_manager.h View File

@@ -42,7 +42,7 @@ struct SharedMemInfo {
uint64_t mem_size = 0;
int fd = 0;
uint8_t *device_address = nullptr;
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
std::shared_ptr<AlignedPtr> host_aligned_ptr = nullptr;
#else
uint8_t *host_address = nullptr;


+ 1
- 1
ge/graph/passes/assign_pass.cc View File

@@ -26,7 +26,7 @@ const int32_t kAssignValueInputIndex = 1;
}

namespace ge {
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
Status AssignPass::Run(NodePtr &node) {
GELOGD("AssignPass running");



+ 1
- 1
ge/graph/passes/assign_pass.h View File

@@ -25,7 +25,7 @@ class AssignPass : public BaseNodePass {
Status Run(NodePtr &node) override;

private:
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
///
/// @brief Optimize for assign_node
/// @param [in] assign_node


+ 2
- 2
ge/graph/passes/constant_fuse_same_pass.cc View File

@@ -115,7 +115,7 @@ void ConstantFuseSamePass::GetFuseConstNodes(ComputeGraphPtr &graph,
TypeUtils::DataTypeToSerialString(data_type).c_str());
continue;
}
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
if ((type_size != 0) && (weight->MutableData().GetAlignedPtr() == nullptr)) {
GELOGW("aligned_ptr is null while size is not 0");
continue;
@@ -125,7 +125,7 @@ void ConstantFuseSamePass::GetFuseConstNodes(ComputeGraphPtr &graph,

SameConstKey map_key;
map_key.data_size = type_size;
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
map_key.aligned_ptr = weight->MutableData().GetAlignedPtr();
#else
map_key.data = weight->GetData().GetData();


+ 3
- 3
ge/graph/passes/constant_fuse_same_pass.h View File

@@ -21,7 +21,7 @@
#include <set>
#include <utility>
#include <vector>
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
#include "graph/aligned_ptr.h"
#endif
#include "graph/types.h"
@@ -30,7 +30,7 @@
namespace ge {
struct SameConstKey {
int data_size;
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
std::shared_ptr<AlignedPtr> aligned_ptr;
#else
const uint8_t *data;
@@ -44,7 +44,7 @@ struct SameConstKey {
if (data_size != key.data_size) {
return data_size < key.data_size;
}
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
if (data_size != 0) {
int ret = memcmp(aligned_ptr->Get(), key.aligned_ptr->Get(), data_size);
if (ret != 0) {


+ 3
- 3
ge/graph/preprocess/graph_preprocess.cc View File

@@ -37,7 +37,7 @@
#include "graph/passes/addn_pass.h"
#include "graph/passes/aicpu_constant_folding_pass.h"
#include "graph/passes/assert_pass.h"
#if (ENABLE_OPEN_SRC == True)
#ifdef ONLY_COMPILE_OPEN_SRC
#include "graph/passes/assign_pass.h"
#endif
#include "graph/passes/common_subexpression_elimination_pass.h"
@@ -1700,7 +1700,7 @@ Status GraphPrepare::PrepareOptimize() {
VarIsInitializedOpPass var_is_initialized_pass;
ParallelConcatStartOpPass parallel_concat_start_op_pass;
IdentityPass identity_pass(false);
#if (ENABLE_OPEN_SRC == True)
#ifdef ONLY_COMPILE_OPEN_SRC
AssignPass assign_pass;
#endif
SnapshotPass snapshot_pass;
@@ -1717,7 +1717,7 @@ Status GraphPrepare::PrepareOptimize() {
names_to_passes.emplace_back("VarIsInitializedOpPass", &var_is_initialized_pass);
names_to_passes.emplace_back("ParallelConcatStartOpPass", &parallel_concat_start_op_pass);
names_to_passes.emplace_back("IdentityPass", &identity_pass);
#if (ENABLE_OPEN_SRC == True)
#ifdef ONLY_COMPILE_OPEN_SRC
if (GetContext().GetHostExecFlag()) {
names_to_passes.emplace_back("AssignPass", &assign_pass);
}


+ 3
- 3
ge/hybrid/common/npu_memory_allocator.cc View File

@@ -20,7 +20,7 @@
#include "graph/manager/graph_caching_allocator.h"
#include "graph/manager/graph_mem_allocator.h"
#include "graph/manager/rdma_pool_allocator.h"
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
#include "graph/manager/host_mem_allocator.h"
#endif

@@ -67,7 +67,7 @@ void *NpuMemoryAllocator::Allocate(std::size_t size, AllocationAttr *attr) {
if (mem_type == RDMA_HBM) {
buffer = MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Malloc(allocate_size, device_id_);
} else if (mem_type == HOST_DDR) {
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
buffer = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(allocate_size);
#else
buffer = malloc(allocate_size);
@@ -108,7 +108,7 @@ void NpuMemoryAllocator::Deallocate(void *data, MemStorageType mem_type) {
if (mem_type == RDMA_HBM) {
MemManager::Instance().RdmaPoolInstance(RT_MEMORY_HBM).Free(reinterpret_cast<uint8_t *>(data), device_id_);
} else if (mem_type == HOST_DDR) {
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Free(data);
#else
free(data);


+ 3
- 3
ge/hybrid/model/hybrid_model_builder.cc View File

@@ -24,7 +24,7 @@
#include "graph/manager/graph_var_manager.h"
#include "graph/manager/host_mem_manager.h"
#include "graph/manager/trans_var_data_utils.h"
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
#include "graph/manager/graph_mem_allocator.h"
#include "graph/manager/host_mem_allocator.h"
#endif
@@ -853,7 +853,7 @@ Status HybridModelBuilder::InitConstantOps() {

std::unique_ptr<TensorValue> var_tensor;
if (GetContext().GetHostExecFlag()) {
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
GE_CHECK_NOTNULL(ge_tensor);
// Address for eigen kernel should be aligned to 16 bytes
// Tensors returned by the GetWeights API share data with proto, whose address is not guaranteed to be aligned
@@ -925,7 +925,7 @@ Status HybridModelBuilder::InitVariableTensors() {
GELOGE(GE_GRAPH_MALLOC_FAILED, "Host variable [%s] malloc failed.", it.first.c_str());
return GE_GRAPH_MALLOC_FAILED;
}
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
if (MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).Malloc(mem_info.host_aligned_ptr,
tensor_size) == nullptr) {
GELOGE(MEMALLOC_FAILED, "Malloc host memory for an existed GeTensor failed.");


+ 3
- 3
ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc View File

@@ -18,7 +18,7 @@
#include "hybrid/node_executor/host_cpu/kernel_factory.h"
#include "graph/passes/folding_pass.h"
#include "hybrid/model/hybrid_model.h"
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
#include "graph/manager/graph_mem_allocator.h"
#include "graph/manager/host_mem_allocator.h"
#endif
@@ -54,7 +54,7 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) {
auto input_desc_ptr = context.GetInputDesc(i);
GE_CHECK_NOTNULL(input_desc_ptr);
const auto &input_desc = *input_desc_ptr;
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
auto tensor = context.GetInput(i);
GE_CHECK_NOTNULL(tensor);
auto item = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).GetAlignedPtr(tensor->GetData());
@@ -84,7 +84,7 @@ Status CpuKernelNodeTask::Execute(TaskContext &context) {
}
auto tensor = context.GetOutput(i);
GE_CHECK_NOTNULL(tensor);
#if (ENABLE_OPEN_SRC != True)
#ifndef ONLY_COMPILE_OPEN_SRC
auto item = MemManager::Instance().HostMemInstance(RT_MEMORY_HBM).GetAlignedPtr(tensor->GetData());
GE_CHECK_NOTNULL(item.second);
auto out_tensor = MakeShared<GeTensor>(output_desc, item.second, item.first);


Loading…
Cancel
Save