@@ -318,7 +318,11 @@ void MemoryBlock::AddDependLifeBegin(DependStreamLife &total_node_depend_stream_ | |||||
AddDependLife(node, node, stream_id_, depend_stream_life_, total_node_depend_stream_life); | AddDependLife(node, node, stream_id_, depend_stream_life_, total_node_depend_stream_life); | ||||
} | } | ||||
} | } | ||||
depend_stream_life_[stream_id_] = GetLifeBegin(); | |||||
// not same stream can't be reused by life time directly, should be reused by dependence | |||||
if (same_stream_) { | |||||
depend_stream_life_[stream_id_] = GetLifeBegin(); | |||||
} | |||||
} | } | ||||
size_t MemoryBlock::GetLifeEnd() { | size_t MemoryBlock::GetLifeEnd() { | ||||
@@ -415,6 +419,15 @@ BlockMemAssigner::~BlockMemAssigner() { | |||||
} | } | ||||
} | } | ||||
///
/// @ingroup GE
/// @brief Mark an op that asks for continuous input but has at most one input.
///        For such an op ATTR_NAME_CONTINUOUS_INPUT_ALLOC is set to true, because with a
///        single input there is no need to reassign continuous memory.
/// @param [in/out] node_op_desc op description that is inspected and, if applicable, marked
/// @return void
///
void BlockMemAssigner::MarkContinuousAllocedForOneInput(OpDescPtr &node_op_desc) {
  bool needs_continuous_input = false;
  // The result of the lookup is ignored on purpose; the flag keeps its default when it is not updated.
  (void)ge::AttrUtils::GetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT, needs_continuous_input);
  if (!needs_continuous_input) {
    return;
  }
  if (node_op_desc->GetInputsSize() > 1) {
    // Several inputs: leave the op unmarked.
    return;
  }
  (void)ge::AttrUtils::SetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, true);
}
void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) { | void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) { | ||||
vector<int64_t> temp; | vector<int64_t> temp; | ||||
for (const NodePtr &n : compute_graph_->GetAllNodes()) { | for (const NodePtr &n : compute_graph_->GetAllNodes()) { | ||||
@@ -425,6 +438,8 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) { | |||||
atomic_addr_clean_id_ = node_op_desc->GetId(); | atomic_addr_clean_id_ = node_op_desc->GetId(); | ||||
} | } | ||||
MarkContinuousAllocedForOneInput(node_op_desc); | |||||
for (auto &out_anchor : n->GetAllOutDataAnchors()) { | for (auto &out_anchor : n->GetAllOutDataAnchors()) { | ||||
GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx()); | GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx()); | ||||
bool reuse_input = false; | bool reuse_input = false; | ||||
@@ -815,14 +830,21 @@ bool BlockMemAssigner::IsContinuousOutput(const NodePtr &n) { | |||||
return false; | return false; | ||||
} | } | ||||
// Get the continuous output type of the node, default is false | |||||
bool is_output_continuous = false; | |||||
auto node_desc = n->GetOpDesc(); | auto node_desc = n->GetOpDesc(); | ||||
if (node_desc == nullptr) { | if (node_desc == nullptr) { | ||||
GELOGE(FAILED, "Node[%s] nodedesc is null.", n->GetName().c_str()); | GELOGE(FAILED, "Node[%s] nodedesc is null.", n->GetName().c_str()); | ||||
return false; | return false; | ||||
} | } | ||||
// if output size just one, no need to reassign continuous memory | |||||
if (node_desc->GetOutputsSize() == 1) { | |||||
GELOGI("op %s output size is one, no need to continuous process.", n->GetName().c_str()); | |||||
return false; | |||||
} | |||||
// Get the continuous output type of the node, default is false | |||||
bool is_output_continuous = false; | |||||
// If GetBool fail, is_output_continuous is false. | // If GetBool fail, is_output_continuous is false. | ||||
(void)ge::AttrUtils::GetBool(node_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_output_continuous); | (void)ge::AttrUtils::GetBool(node_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_output_continuous); | ||||
if (is_output_continuous) { | if (is_output_continuous) { | ||||
@@ -928,6 +950,7 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec | |||||
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null."); | GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null."); | ||||
auto node_op_desc = n->GetOpDesc(); | auto node_op_desc = n->GetOpDesc(); | ||||
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); | GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); | ||||
MemoryBlock *block = nullptr; | MemoryBlock *block = nullptr; | ||||
int64_t total_size = 0; | int64_t total_size = 0; | ||||
int64_t memory_type = RT_MEMORY_HBM; | int64_t memory_type = RT_MEMORY_HBM; | ||||
@@ -1111,15 +1134,21 @@ bool IsKnownSubgraphData(const NodePtr &node) { | |||||
return node->GetOpDesc()->HasAttr(ATTR_NAME_PARENT_NODE_INDEX); | return node->GetOpDesc()->HasAttr(ATTR_NAME_PARENT_NODE_INDEX); | ||||
} | } | ||||
void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory) { | |||||
void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory, | |||||
bool same_stream) { | |||||
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null."); | GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null."); | ||||
GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory"); | GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory"); | ||||
GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory"); | GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory"); | ||||
--to_release->ref_count_; | --to_release->ref_count_; | ||||
if (!same_stream) { | |||||
to_release->same_stream_ = false; | |||||
} | |||||
if (to_release->ref_count_ == 0) { | if (to_release->ref_count_ == 0) { | ||||
to_release->SetLifeTimeEnd(life_time_); | to_release->SetLifeTimeEnd(life_time_); | ||||
reusable_memory.emplace_back(to_release); | |||||
AddReusableBlockCount(*to_release, reusable_block_counts_); | |||||
if (to_release->same_stream_) { | |||||
reusable_memory.emplace_back(to_release); | |||||
AddReusableBlockCount(*to_release, reusable_block_counts_); | |||||
} | |||||
} | } | ||||
} | } | ||||
@@ -1159,10 +1188,9 @@ void BlockMemAssigner::ReleaseInputNodeOutMemory(const unordered_map<string, vec | |||||
node_type_indexs.back().node->GetName().c_str()); | node_type_indexs.back().node->GetName().c_str()); | ||||
if ((node_type_indexs.back().node == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) && | if ((node_type_indexs.back().node == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) && | ||||
(node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx())) && | |||||
(node->GetOpDesc()->GetStreamId() == block->stream_id_)) { | |||||
ReleaseMemory(block, reusable_memory); | |||||
if (block->ref_count_ == 0) { | |||||
(node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx()))) { | |||||
ReleaseMemory(block, reusable_memory, (node->GetOpDesc()->GetStreamId() == block->stream_id_)); | |||||
if (block->ref_count_ == 0 && block->same_stream_) { | |||||
SetLastUsedInputMemAttr(node, in_anchor->GetIdx()); | SetLastUsedInputMemAttr(node, in_anchor->GetIdx()); | ||||
} | } | ||||
} | } | ||||
@@ -1682,10 +1710,10 @@ void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block, | |||||
op_desc->SetWorkspace(workspace_list); | op_desc->SetWorkspace(workspace_list); | ||||
} | } | ||||
GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu]" | GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu]" | ||||
" noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d] isref[%d].", graph_name.c_str(), | |||||
" noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d].", graph_name.c_str(), | |||||
op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(), | op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(), | ||||
block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block, block->reuse_mem_, | block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block, block->reuse_mem_, | ||||
block->continuous_block_, block->deleted_block_, node_type.ref_input); | |||||
block->continuous_block_, block->deleted_block_, block->same_stream_, node_type.ref_input); | |||||
} | } | ||||
void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) { | void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) { | ||||
@@ -1746,9 +1774,8 @@ Status BlockMemAssigner::Assign() { | |||||
bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const { | bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const { | ||||
return (node_type == VARIABLE) || (node_type == CONSTANT) || (node_type == MULTISHAPE) || | return (node_type == VARIABLE) || (node_type == CONSTANT) || (node_type == MULTISHAPE) || | ||||
(node_type == HCOMBROADCAST) || (node_type == CONSTANTOP) || | |||||
(node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || (node_type == ASSIGN) || (node_type == HVDWAIT) || | |||||
(node_type == HVDCALLBACKBROADCAST); | |||||
(node_type == CONSTANTOP) || (node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || | |||||
(node_type == ASSIGN) || (node_type == HVDWAIT); | |||||
} | } | ||||
bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type) { | bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type) { | ||||
@@ -65,6 +65,7 @@ class MemoryBlock { | |||||
stream_id_(stream_id), | stream_id_(stream_id), | ||||
deleted_block_(false), | deleted_block_(false), | ||||
reuse_mem_(reuse_mem), | reuse_mem_(reuse_mem), | ||||
same_stream_(true), | |||||
input_index_(0), | input_index_(0), | ||||
continuous_block_(false), | continuous_block_(false), | ||||
first_continuous_block_(false), | first_continuous_block_(false), | ||||
@@ -142,6 +143,7 @@ class MemoryBlock { | |||||
int64_t stream_id_; | int64_t stream_id_; | ||||
bool deleted_block_; | bool deleted_block_; | ||||
bool reuse_mem_; | bool reuse_mem_; | ||||
bool same_stream_; | |||||
uint32_t input_index_; | uint32_t input_index_; | ||||
bool continuous_block_; | bool continuous_block_; | ||||
bool first_continuous_block_; | bool first_continuous_block_; | ||||
@@ -353,7 +355,7 @@ class BlockMemAssigner : public MemAssigner { | |||||
/// @return void | /// @return void | ||||
/// @author | /// @author | ||||
/// | /// | ||||
void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory); | |||||
void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory, bool same_stream = true); | |||||
/// | /// | ||||
/// @ingroup GE | /// @ingroup GE | ||||
@@ -409,6 +411,8 @@ class BlockMemAssigner : public MemAssigner { | |||||
MemoryBlock *ApplyContinuousMemory(const NodePtr &n, const vector<int64_t> &ranges, const bool is_op_reuse_mem); | MemoryBlock *ApplyContinuousMemory(const NodePtr &n, const vector<int64_t> &ranges, const bool is_op_reuse_mem); | ||||
void MarkContinuousAllocedForOneInput(OpDescPtr &node_op_desc); | |||||
std::unordered_map<int64_t, std::unordered_map<int64_t, std::vector<MemoryBlock *>>> reusable_blocks_; | std::unordered_map<int64_t, std::unordered_map<int64_t, std::vector<MemoryBlock *>>> reusable_blocks_; | ||||
std::map<std::string, uint64_t> reusable_block_counts_; | std::map<std::string, uint64_t> reusable_block_counts_; | ||||
@@ -1993,12 +1993,6 @@ Status DavinciModel::SyncVarData() { | |||||
RT_MEMCPY_HOST_TO_DEVICE)); | RT_MEMCPY_HOST_TO_DEVICE)); | ||||
} | } | ||||
for (auto op_desc : variable_op_list_) { | |||||
ret = | |||||
VarManager::Instance(session_id_)->SyncVarData(runtime_param_.graph_id, op_desc->GetName(), op_desc, mem_base_); | |||||
GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret failed, model id:%u, op name:%s.", model_id_, | |||||
op_desc->GetName().c_str()); | |||||
} | |||||
return ret; | return ret; | ||||
} | } | ||||
@@ -1997,6 +1997,8 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) { | |||||
new (std::nothrow) TransOpWithoutReshapeFusionPass)) | new (std::nothrow) TransOpWithoutReshapeFusionPass)) | ||||
GE_CHK_STATUS_RET(after_merge_passes.AddPass("OptimizeStage1_1::TransOpBreadthFusionPass", | GE_CHK_STATUS_RET(after_merge_passes.AddPass("OptimizeStage1_1::TransOpBreadthFusionPass", | ||||
new (std::nothrow) TransOpBreadthFusionPass)) | new (std::nothrow) TransOpBreadthFusionPass)) | ||||
GE_CHK_STATUS_RET( | |||||
after_merge_passes.AddPass("OptimizeStage1_1::HcclMemcpyPass", new (std::nothrow) HcclMemcpyPass)); | |||||
GE_TIMESTAMP_START(after_merge_passes); | GE_TIMESTAMP_START(after_merge_passes); | ||||
auto ret = after_merge_passes.Run(compute_graph); | auto ret = after_merge_passes.Run(compute_graph); | ||||
@@ -32,46 +32,152 @@ const char *const kInputMutable = "_input_mutable"; | |||||
} // namespace | } // namespace | ||||
namespace ge { | namespace ge { | ||||
///
/// @brief Run the pass over every direct node of the graph. For each node the continuous-input,
///        mutable-input and p2p-input steps are applied in that order; the first failure aborts the pass.
/// @param [in] ge::ComputeGraphPtr graph
/// @return SUCCESS on success, PARAM_INVALID when graph is null, INTERNAL_ERROR when a node has
///         no op_desc, otherwise the status returned by the failing step
///
Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) {
  Status ret = SUCCESS;
  GE_IF_BOOL_EXEC(graph == nullptr, GELOGE(PARAM_INVALID, "param [graph] must not be null."); return PARAM_INVALID);
  for (const auto &node : graph->GetDirectNode()) {
    auto op_desc = node->GetOpDesc();
    if (op_desc == nullptr) {
      GELOGE(INTERNAL_ERROR, "node has no op_desc, node_name : %s.", node->GetName().c_str());
      return INTERNAL_ERROR;
    }

    ret = ContinuousInputProcess(graph, node);
    if (ret != SUCCESS) {
      // Log the name of the step that actually failed so the message can be traced back to the code.
      GELOGE(INTERNAL_ERROR, "failed ContinuousInputProcess, node_name:%s.", node->GetName().c_str());
      return ret;
    }

    ret = MutableInputProcess(graph, node);
    if (ret != SUCCESS) {
      GELOGE(INTERNAL_ERROR, "failed MutableInputProcess, node_name:%s.", node->GetName().c_str());
      return ret;
    }

    ret = P2pmemInputProcess(graph, node);
    if (ret != SUCCESS) {
      GELOGE(INTERNAL_ERROR, "failed P2pmemInputProcess, node_name:%s.", node->GetName().c_str());
      return ret;
    }
  }
  return ret;
}
///
/// @brief Handle nodes carrying the _input_mutable attr.
///        The attr means the input memory may be modified while the op executes. To keep that
///        modification from affecting another op that uses the same input, a memcpy (Identity)
///        node is inserted between the producer and this node. The same is done when the input
///        comes from a variable or const.
/// @param [in] graph graph that owns the node
/// @param [in] node node to process
/// @return SUCCESS when nothing has to be done or all edges were rewired,
///         FAILED when the attr exists but cannot be read, otherwise the failing status
///
Status HcclMemcpyPass::MutableInputProcess(const ComputeGraphPtr &graph, const NodePtr node) {
  auto op_desc = node->GetOpDesc();
  if (!AttrUtils::HasAttr(op_desc, kInputMutable)) {
    return SUCCESS;
  }

  bool is_input_mutable = false;
  if (!AttrUtils::GetBool(op_desc, kInputMutable, is_input_mutable)) {
    GELOGE(INTERNAL_ERROR, "node:%s get attr:_input_mutable failed.", node->GetName().c_str());
    return FAILED;
  }
  if (!is_input_mutable) {
    return SUCCESS;
  }

  GELOGI("input mutable hcom op is:%s.", op_desc->GetName().c_str());
  for (auto &hccl_in_anchor : node->GetAllInDataAnchors()) {
    if (hccl_in_anchor == nullptr) {
      continue;
    }
    auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor();
    GE_CHECK_NOTNULL(src_out_anchor);

    int32_t peer_in_count = src_out_anchor->GetPeerInDataAnchors().size();
    const bool has_anchor_size_peers = (peer_in_count == kAnchorSize);
    // When the producer output has exactly kAnchorSize consumers, an Identity is only needed for
    // constant (/data) producers, to avoid the constant being cleared. Otherwise it is always needed.
    if (has_anchor_size_peers && !IsDataNode(src_out_anchor->GetOwnerNode()->GetType())) {
      continue;
    }

    Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor);
    if (ret != SUCCESS) {
      GELOGE(INTERNAL_ERROR, "Failed to modify the connection.");
      return ret;
    }
  }
  return SUCCESS;
}
///
/// @brief Handle nodes that require continuous input memory and have more than one input.
///        If a broadcast has more than one input and an input comes from a variable, the inputs
///        must be continuous, so separate featuremap memory is allocated for the broadcast input
///        and the data would have to be moved from variable memory to it on each step.
///        A memcpy node is inserted so that this move is done inside the model instead of
///        by code outside of it.
/// @param [in] graph graph that owns the node
/// @param [in] node node to process
/// @return SUCCESS when nothing has to be done or all edges were rewired,
///         INTERNAL_ERROR when an input anchor has no peer, otherwise the failing status
///
Status HcclMemcpyPass::ContinuousInputProcess(const ComputeGraphPtr &graph, const NodePtr node) {
  auto op_desc = node->GetOpDesc();
  bool is_input_continuous = false;
  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);
  if (!is_input_continuous || op_desc->GetInputsSize() <= 1) {
    return SUCCESS;
  }

  GELOGI("continuous input op is:%s.", op_desc->GetName().c_str());
  // With more than one input, insert a memcpy after var data so continuous memory can be allocated.
  for (auto &hccl_in_anchor : node->GetAllInDataAnchors()) {
    if (hccl_in_anchor == nullptr) {
      continue;
    }
    auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor();
    if (src_out_anchor == nullptr) {
      GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str());
      return INTERNAL_ERROR;
    }

    const bool from_data_node = IsDataNode(src_out_anchor->GetOwnerNode()->GetType());
    if (!from_data_node) {
      continue;
    }
    Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor);
    if (ret != SUCCESS) {
      GELOGE(INTERNAL_ERROR, "Failed to modify the connection.");
      return ret;
    }
  }
  return SUCCESS;
}
// If an input is of var type and the node needs p2p memory for that input, a memcpy should be inserted between the two.
Status HcclMemcpyPass::P2pmemInputProcess(const ComputeGraphPtr &graph, const NodePtr node) { | |||||
auto op_desc = node->GetOpDesc(); | |||||
vector<int64_t> input_memory_types; | |||||
(void) ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, input_memory_types); | |||||
if (input_memory_types.empty()) { | |||||
return SUCCESS; | |||||
} | |||||
for (uint32_t index = 0; index < input_memory_types.size() && index < op_desc->GetInputsSize(); index++) { | |||||
if (input_memory_types[index] != RT_MEMORY_P2P_DDR) { | |||||
continue; | |||||
} | |||||
GELOGI("p2p input op is:%s.", op_desc->GetName().c_str()); | |||||
auto hccl_in_anchor = node->GetInDataAnchor(index); | |||||
if (hccl_in_anchor == nullptr) { | |||||
continue; | |||||
} | |||||
auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); | |||||
if (src_out_anchor == nullptr) { | |||||
GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str()); | |||||
return INTERNAL_ERROR; | |||||
} | |||||
if (IsDataNode(src_out_anchor->GetOwnerNode()->GetType())) { | |||||
Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); | ||||
if (ret != SUCCESS) { | if (ret != SUCCESS) { | ||||
GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); | GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); | ||||
@@ -82,8 +188,12 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { | |||||
return SUCCESS; | return SUCCESS; | ||||
} | } | ||||
bool HcclMemcpyPass::IsDataNode(const std::string& node_type) { | |||||
return (node_type == CONSTANTOP) || (node_type == VARIABLE) || (node_type == DATA) || (node_type == CONSTANT); | |||||
} | |||||
/// | /// | ||||
/// @brief Add MemcpyAsync Node | |||||
/// @brief Add Identity Node | |||||
/// @param [in] ge::ComputeGraphPtr graph | /// @param [in] ge::ComputeGraphPtr graph | ||||
/// @param [in] ge::OutDataAnchorPtr in_node | /// @param [in] ge::OutDataAnchorPtr in_node | ||||
/// @return ge::NodePtr | /// @return ge::NodePtr | ||||
@@ -101,20 +211,20 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O | |||||
node_name = CheckDuplicateName(node_name); | node_name = CheckDuplicateName(node_name); | ||||
OpDescPtr op_desc = MakeShared<OpDesc>(node_name.c_str(), IDENTITY); | OpDescPtr op_desc = MakeShared<OpDesc>(node_name.c_str(), IDENTITY); | ||||
if (op_desc == nullptr) { | if (op_desc == nullptr) { | ||||
GELOGE(INTERNAL_ERROR, "Create identity op: MakeShared op_desc fail."); | |||||
GELOGE(INTERNAL_ERROR, "Create Identity op: MakeShared op_desc fail."); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
GELOGI("Create identity op:%s.", op_desc->GetName().c_str()); | |||||
GELOGI("Create Identity op:%s.", op_desc->GetName().c_str()); | |||||
graphStatus ret = op_desc->AddInputDesc("x", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); | graphStatus ret = op_desc->AddInputDesc("x", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); | ||||
if (ret != GRAPH_SUCCESS) { | if (ret != GRAPH_SUCCESS) { | ||||
GELOGE(INTERNAL_ERROR, "Create identity op: add input desc fail."); | |||||
GELOGE(INTERNAL_ERROR, "Create Identity op: add input desc fail."); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
ret = op_desc->AddOutputDesc("y", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); | ret = op_desc->AddOutputDesc("y", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx())); | ||||
if (ret != GRAPH_SUCCESS) { | if (ret != GRAPH_SUCCESS) { | ||||
GELOGE(INTERNAL_ERROR, "Create identity op: add output desc fail."); | |||||
GELOGE(INTERNAL_ERROR, "Create Identity op: add output desc fail."); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
// because history reason ,this pass can not do work after constant fold so mark it | // because history reason ,this pass can not do work after constant fold so mark it | ||||
@@ -122,7 +232,7 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O | |||||
NodePtr memcpy_node = graph->AddNode(op_desc); | NodePtr memcpy_node = graph->AddNode(op_desc); | ||||
if (memcpy_node == nullptr) { | if (memcpy_node == nullptr) { | ||||
GELOGE(INTERNAL_ERROR, "Insert identity node fail."); | |||||
GELOGE(INTERNAL_ERROR, "Insert Identity node fail."); | |||||
return nullptr; | return nullptr; | ||||
} | } | ||||
@@ -155,7 +265,8 @@ std::string HcclMemcpyPass::CheckDuplicateName(const std::string &node_name) { | |||||
/// | /// | ||||
Status HcclMemcpyPass::ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | Status HcclMemcpyPass::ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | ||||
const InDataAnchorPtr &hccl_in_anchor) { | const InDataAnchorPtr &hccl_in_anchor) { | ||||
GELOGI("The op %s need insert memcpy async op.", src_out_anchor->GetOwnerNode()->GetName().c_str()); | |||||
GELOGI("Between op %s and op %s need insert memcpy async op.", src_out_anchor->GetOwnerNode()->GetName().c_str(), | |||||
hccl_in_anchor->GetOwnerNode()->GetName().c_str()); | |||||
NodePtr memcpy_node = CreateIdentityNode(graph, src_out_anchor); | NodePtr memcpy_node = CreateIdentityNode(graph, src_out_anchor); | ||||
GE_CHECK_NOTNULL(memcpy_node); | GE_CHECK_NOTNULL(memcpy_node); | ||||
@@ -37,6 +37,14 @@ class HcclMemcpyPass : public GraphPass { | |||||
Status ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | Status ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, | ||||
const InDataAnchorPtr &hccl_in_anchor); | const InDataAnchorPtr &hccl_in_anchor); | ||||
Status ContinuousInputProcess(const ComputeGraphPtr &graph, const NodePtr node); | |||||
Status MutableInputProcess(const ComputeGraphPtr &graph, const NodePtr node); | |||||
Status P2pmemInputProcess(const ComputeGraphPtr &graph, const NodePtr node); | |||||
bool IsDataNode(const std::string& node_type); | |||||
std::unordered_map<std::string, uint32_t> node_num_map_; | std::unordered_map<std::string, uint32_t> node_num_map_; | ||||
}; | }; | ||||
} // namespace ge | } // namespace ge | ||||
@@ -60,7 +60,6 @@ | |||||
#include "graph/passes/get_original_format_pass.h" | #include "graph/passes/get_original_format_pass.h" | ||||
#include "graph/passes/guarantee_const_pass.h" | #include "graph/passes/guarantee_const_pass.h" | ||||
#include "graph/passes/hccl_group_pass.h" | #include "graph/passes/hccl_group_pass.h" | ||||
#include "graph/passes/hccl_memcpy_pass.h" | |||||
#include "graph/passes/identity_pass.h" | #include "graph/passes/identity_pass.h" | ||||
#include "graph/passes/infershape_pass.h" | #include "graph/passes/infershape_pass.h" | ||||
#include "graph/passes/iterator_op_pass.h" | #include "graph/passes/iterator_op_pass.h" | ||||
@@ -1693,8 +1692,6 @@ Status GraphPrepare::PrepareOptimize() { | |||||
PassManager graph_pass; | PassManager graph_pass; | ||||
try { | try { | ||||
(void)graph_pass.AddPass("PrepareOptimize::PrunePass", new PrunePass); | (void)graph_pass.AddPass("PrepareOptimize::PrunePass", new PrunePass); | ||||
// todo 临时把hccl的memcpy插入放到图准备,为了防止其多插memcpy | |||||
(void)graph_pass.AddPass("PrepareOptimize::HcclMemcpyPass", new (std::nothrow) HcclMemcpyPass); | |||||
} catch (std::bad_alloc &e) { | } catch (std::bad_alloc &e) { | ||||
GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs."); | GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs."); | ||||
return INTERNAL_ERROR; | return INTERNAL_ERROR; | ||||
@@ -245,6 +245,11 @@ const std::string INPUT_FP16_NODES = "ge.INPUT_NODES_SET_FP16"; | |||||
// 0: close debug; 1: open TBE compiler; 2: open ccec compiler | // 0: close debug; 1: open TBE compiler; 2: open ccec compiler | ||||
const std::string OP_DEBUG_LEVEL = "ge.opDebugLevel"; | const std::string OP_DEBUG_LEVEL = "ge.opDebugLevel"; | ||||
// Option for fixing the hcombroadcast format.
// When configured as model multi, the broadcast format should be fixed.
// 0: data multi; 1: model multi
const std::string HCOM_MULTI_MODE = "ge.hcomMultiMode"; | |||||
// Graph run mode | // Graph run mode | ||||
enum GraphRunMode { PREDICTION = 0, TRAIN }; | enum GraphRunMode { PREDICTION = 0, TRAIN }; | ||||