From 082f9195d4f85e0194c00440322d6e13678374fb Mon Sep 17 00:00:00 2001 From: wangxiaotian22 Date: Thu, 5 Nov 2020 17:01:53 +0800 Subject: [PATCH] 1.if broadcast only one input or output, skip continuous mem reassign 2.if broadcast input more than one, and from variable, add memcpy node between them. delete move variable to broadcast input in davinci model run --- ge/graph/build/memory/block_mem_assigner.cc | 19 ++++++++-- ge/graph/load/new_model_manager/davinci_model.cc | 6 ---- ge/graph/passes/hccl_memcpy_pass.cc | 46 +++++++++++++++++++++++- ge/graph/passes/hccl_memcpy_pass.h | 2 ++ 4 files changed, 63 insertions(+), 10 deletions(-) diff --git a/ge/graph/build/memory/block_mem_assigner.cc b/ge/graph/build/memory/block_mem_assigner.cc index 2d30c57e..a406d384 100755 --- a/ge/graph/build/memory/block_mem_assigner.cc +++ b/ge/graph/build/memory/block_mem_assigner.cc @@ -425,6 +425,13 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector &all_memory_size) { atomic_addr_clean_id_ = node_op_desc->GetId(); } + // if input size just one, no need to reassign continuous memory + bool is_input_continuous = false; + (void)ge::AttrUtils::GetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous); + if (is_input_continuous && (node_op_desc->GetInputSize() <= 1)) { + (void)ge::AttrUtils::SetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, true); + } + for (auto &out_anchor : n->GetAllOutDataAnchors()) { GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx()); bool reuse_input = false; @@ -928,6 +935,13 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null."); auto node_op_desc = n->GetOpDesc(); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null."); + + // if output size just one, no need to reassign continuous memory + if (node_op_desc->GetOutputsSize() == 1) { + 
zero_memory_list_.emplace_back(n, kOutput, 0); + return nullptr; + } + MemoryBlock *block = nullptr; int64_t total_size = 0; int64_t memory_type = RT_MEMORY_HBM; @@ -1746,9 +1760,8 @@ Status BlockMemAssigner::Assign() { bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const { return (node_type == VARIABLE) || (node_type == CONSTANT) || (node_type == MULTISHAPE) || - (node_type == HCOMBROADCAST) || (node_type == CONSTANTOP) || - (node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || (node_type == ASSIGN) || (node_type == HVDWAIT) || - (node_type == HVDCALLBACKBROADCAST); + (node_type == CONSTANTOP) || (node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || + (node_type == ASSIGN) || (node_type == HVDWAIT); } bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type) { diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc index cb37182c..05b4cd10 100755 --- a/ge/graph/load/new_model_manager/davinci_model.cc +++ b/ge/graph/load/new_model_manager/davinci_model.cc @@ -1993,12 +1993,6 @@ Status DavinciModel::SyncVarData() { RT_MEMCPY_HOST_TO_DEVICE)); } - for (auto op_desc : variable_op_list_) { - ret = - VarManager::Instance(session_id_)->SyncVarData(runtime_param_.graph_id, op_desc->GetName(), op_desc, mem_base_); - GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret failed, model id:%u, op name:%s.", model_id_, - op_desc->GetName().c_str()); - } return ret; } diff --git a/ge/graph/passes/hccl_memcpy_pass.cc b/ge/graph/passes/hccl_memcpy_pass.cc index 21747f42..2c34c38e 100755 --- a/ge/graph/passes/hccl_memcpy_pass.cc +++ b/ge/graph/passes/hccl_memcpy_pass.cc @@ -37,6 +37,12 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { auto op_desc = node->GetOpDesc(); GE_IF_BOOL_EXEC(op_desc == nullptr, continue); + Status ret = ProcessBroadcastMemcpy(graph, node); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "failed 
ProcessBroadcastMemcpy."); + return ret; + } + bool node_input_mutable = false; if (!AttrUtils::HasAttr(op_desc, kInputMutable)) { continue; @@ -61,7 +67,7 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { // Memcpyasync needs to be inserted between constant (/data) and hcomallreduce to avoid constant being cleared. NodePtr src_node = src_out_anchor->GetOwnerNode(); std::string src_type = src_node->GetType(); - bool check_src_type = (src_type == CONSTANTOP) || (src_type == DATA) || (src_type == CONSTANT); + bool check_src_type = (src_type == CONSTANTOP) || (src_type == VARIABLE) || (src_type == DATA) || (src_type == CONSTANT); if (check_src_type) { Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); if (ret != SUCCESS) { @@ -82,6 +88,44 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) { return SUCCESS; } +// If broadcast input size is bigger than 1, and input from variable, +// because broadcast input memory should be continuous, +// another featuremap mem will be allocated for broadcast input. +// In this condition, move data from variable mem to broadcast input featuremap mem will be executed each step. +// In order to avoid move action out of model, use memcpy node instead of move action code. 
+Status HcclMemcpyPass::ProcessBroadcastMemcpy(const ComputeGraphPtr &graph, const NodePtr node) { + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + GELOGE(INTERNAL_ERROR, "node has no op_desc, node_name : %s.", node->GetName().c_str()); + return INTERNAL_ERROR; + } + + if ((node->GetType() == HCOMBROADCAST || node->GetType() == HVDCALLBACKBROADCAST) && op_desc->GetInputSize() > 1) { + for (auto &hccl_in_anchor : node->GetAllInDataAnchors()) { + if (hccl_in_anchor == nullptr) { + continue; + } + auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor(); + if (src_out_anchor == nullptr) { + GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str()); + return INTERNAL_ERROR; + } + + NodePtr src_node = src_out_anchor->GetOwnerNode(); + std::string src_type = src_node->GetType(); + bool check_src_type = (src_type == CONSTANTOP) || (src_type == VARIABLE) || (src_type == DATA) || (src_type == CONSTANT); + if (check_src_type) { + Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor); + if (ret != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Failed to modify the connection."); + return ret; + } + } + } + } + return SUCCESS; +} + /// /// @brief Add MemcpyAsync Node /// @param [in] ge::ComputeGraphPtr graph diff --git a/ge/graph/passes/hccl_memcpy_pass.h b/ge/graph/passes/hccl_memcpy_pass.h index e73a5483..aaf00779 100755 --- a/ge/graph/passes/hccl_memcpy_pass.h +++ b/ge/graph/passes/hccl_memcpy_pass.h @@ -37,6 +37,8 @@ class HcclMemcpyPass : public GraphPass { Status ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor, const InDataAnchorPtr &hccl_in_anchor); + Status ProcessBroadcastMemcpy(const ComputeGraphPtr &graph, const NodePtr node); + std::unordered_map node_num_map_; }; } // namespace ge