diff --git a/ge/graph/build/memory/block_mem_assigner.cc b/ge/graph/build/memory/block_mem_assigner.cc
index 2d30c57e..efb04b68 100755
--- a/ge/graph/build/memory/block_mem_assigner.cc
+++ b/ge/graph/build/memory/block_mem_assigner.cc
@@ -318,7 +318,11 @@ void MemoryBlock::AddDependLifeBegin(DependStreamLife &total_node_depend_stream_
       AddDependLife(node, node, stream_id_, depend_stream_life_, total_node_depend_stream_life);
     }
   }
-  depend_stream_life_[stream_id_] = GetLifeBegin();
+
+  // A block used across different streams can't be reused by life time directly; it should be reused by dependence.
+  if (same_stream_) {
+    depend_stream_life_[stream_id_] = GetLifeBegin();
+  }
 }
 
 size_t MemoryBlock::GetLifeEnd() {
@@ -415,6 +419,15 @@ BlockMemAssigner::~BlockMemAssigner() {
   }
 }
 
+void BlockMemAssigner::MarkContinuousAllocedForOneInput(OpDescPtr &node_op_desc) {
+  // With at most one input there is no need to reassign continuous memory, so the op is marked as already allocated.
+  bool is_input_continuous = false;
+  (void)ge::AttrUtils::GetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);  // result ignored
+  if (is_input_continuous && (node_op_desc->GetInputsSize() <= 1)) {  // continuous-input op with zero or one input
+    (void)ge::AttrUtils::SetBool(node_op_desc, ATTR_NAME_CONTINUOUS_INPUT_ALLOC, true);  // result ignored
+  }
+}
+
 void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) {
   vector<int64_t> temp;
   for (const NodePtr &n : compute_graph_->GetAllNodes()) {
@@ -425,6 +438,8 @@ void BlockMemAssigner::GetOutAndWorkSpaceMem(vector<int64_t> &all_memory_size) {
       atomic_addr_clean_id_ = node_op_desc->GetId();
     }
 
+    MarkContinuousAllocedForOneInput(node_op_desc);
+
     for (auto &out_anchor : n->GetAllOutDataAnchors()) {
       GeTensorDesc output_desc = node_op_desc->GetOutputDesc(out_anchor->GetIdx());
       bool reuse_input = false;
@@ -815,14 +830,21 @@ bool BlockMemAssigner::IsContinuousOutput(const NodePtr &n) {
     return false;
   }
 
-  // Get the continuous output type of the node, default is false
-  bool is_output_continuous = false;
   auto node_desc = n->GetOpDesc();
   if (node_desc == nullptr) {
     GELOGE(FAILED, "Node[%s] nodedesc is null.", n->GetName().c_str());
     return false;
   }
 
+  // If the node has only one output, there is no need to reassign continuous memory.
+  if (node_desc->GetOutputsSize() == 1) {
+    GELOGI("op %s output size is one, no need to continuous process.", n->GetName().c_str());
+    return false;
+  }
+
+  // Get the continuous output type of the node, default is false
+  bool is_output_continuous = false;
+
   // If GetBool fail, is_output_continuous is false.
   (void)ge::AttrUtils::GetBool(node_desc, ATTR_NAME_CONTINUOUS_OUTPUT, is_output_continuous);
   if (is_output_continuous) {
@@ -928,6 +950,7 @@ MemoryBlock *BlockMemAssigner::ApplyContinuousMemory(const NodePtr &n, const vec
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "input node is null.");
   auto node_op_desc = n->GetOpDesc();
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(node_op_desc == nullptr, return nullptr, "node_op_desc is null.");
+
   MemoryBlock *block = nullptr;
   int64_t total_size = 0;
   int64_t memory_type = RT_MEMORY_HBM;
@@ -1111,15 +1134,21 @@ bool IsKnownSubgraphData(const NodePtr &node) {
   return node->GetOpDesc()->HasAttr(ATTR_NAME_PARENT_NODE_INDEX);
 }
 
-void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory) {
+void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory,
+                                     bool same_stream) {  // same_stream: false marks the block as cross-stream
   GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(to_release == nullptr, return, "Input parameter to_release is null.");
   GE_CHK_TRUE_EXEC_INFO(to_release->ref_count_ <= 0, return, "Release memory");
   GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory");
   --to_release->ref_count_;
+  if (!same_stream) {  // one cross-stream release is enough to clear the flag
+    to_release->same_stream_ = false;  // this function never sets the flag back to true
+  }
   if (to_release->ref_count_ == 0) {
     to_release->SetLifeTimeEnd(life_time_);
-    reusable_memory.emplace_back(to_release);
-    AddReusableBlockCount(*to_release, reusable_block_counts_);
+    if (to_release->same_stream_) {  // a block that saw a cross-stream release is not put in reusable_memory
+      reusable_memory.emplace_back(to_release);
+      AddReusableBlockCount(*to_release, reusable_block_counts_);
+    }
   }
 }
 
@@ -1159,10 +1188,9 @@ void BlockMemAssigner::ReleaseInputNodeOutMemory(const unordered_map<string, vec
              node_type_indexs.back().node->GetName().c_str());
 
       if ((node_type_indexs.back().node == in_anchor->GetPeerOutAnchor()->GetOwnerNode()) &&
-          (node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx())) &&
-          (node->GetOpDesc()->GetStreamId() == block->stream_id_)) {
-        ReleaseMemory(block, reusable_memory);
-        if (block->ref_count_ == 0) {
+          (node_type_indexs.back().index == static_cast<uint32_t>(in_anchor->GetPeerOutAnchor()->GetIdx()))) {
+        ReleaseMemory(block, reusable_memory, (node->GetOpDesc()->GetStreamId() == block->stream_id_));
+        if (block->ref_count_ == 0 && block->same_stream_) {
           SetLastUsedInputMemAttr(node, in_anchor->GetIdx());
         }
       }
@@ -1682,10 +1710,10 @@ void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block,
     op_desc->SetWorkspace(workspace_list);
   }
   GELOGI("[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu]"
-         " noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d] isref[%d].", graph_name.c_str(),
+         " noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d:%d:%d:%d:%d] isref[%d].", graph_name.c_str(),
          op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset, op_desc->GetStreamId(),
          block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block, block->reuse_mem_,
-         block->continuous_block_, block->deleted_block_, node_type.ref_input);
+         block->continuous_block_, block->deleted_block_, block->same_stream_, node_type.ref_input);
 }
 
 void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) {
@@ -1746,9 +1774,8 @@ Status BlockMemAssigner::Assign() {
 
 bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const {
   return (node_type == VARIABLE) || (node_type == CONSTANT) || (node_type == MULTISHAPE) ||
-         (node_type == HCOMBROADCAST) || (node_type == CONSTANTOP) ||
-         (node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || (node_type == ASSIGN) || (node_type == HVDWAIT) ||
-         (node_type == HVDCALLBACKBROADCAST);
+         (node_type == CONSTANTOP) || (node_type == ASSIGNADD) || (node_type == ASSIGNSUB) ||
+         (node_type == ASSIGN) || (node_type == HVDWAIT);  // NOTE(review): HCOMBROADCAST and HVDCALLBACKBROADCAST are not in this list; confirm that is intended
 }
 
 bool BlockMemAssigner::GetWorkSpaceMemoryType(const NodePtr &node, size_t index, int64_t &memory_type) {
diff --git a/ge/graph/build/memory/block_mem_assigner.h b/ge/graph/build/memory/block_mem_assigner.h
index f3d26c1d..d8a9573f 100755
--- a/ge/graph/build/memory/block_mem_assigner.h
+++ b/ge/graph/build/memory/block_mem_assigner.h
@@ -65,6 +65,7 @@ class MemoryBlock {
         stream_id_(stream_id),
         deleted_block_(false),
         reuse_mem_(reuse_mem),
+        same_stream_(true),
         input_index_(0),
         continuous_block_(false),
         first_continuous_block_(false),
@@ -142,6 +143,7 @@ class MemoryBlock {
   int64_t stream_id_;
   bool deleted_block_;
   bool reuse_mem_;
+  bool same_stream_;
   uint32_t input_index_;
   bool continuous_block_;
   bool first_continuous_block_;
@@ -353,7 +355,7 @@ class BlockMemAssigner : public MemAssigner {
   /// @return void
   /// @author
   ///
-  void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory);
+  void ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock *> &reusable_memory, bool same_stream = true);
 
   ///
   /// @ingroup GE
@@ -409,6 +411,8 @@ class BlockMemAssigner : public MemAssigner {
 
   MemoryBlock *ApplyContinuousMemory(const NodePtr &n, const vector<int64_t> &ranges, const bool is_op_reuse_mem);
 
+  void MarkContinuousAllocedForOneInput(OpDescPtr &node_op_desc);
+
   std::unordered_map<int64_t, std::unordered_map<int64_t, std::vector<MemoryBlock *>>> reusable_blocks_;
 
   std::map<std::string, uint64_t> reusable_block_counts_;
diff --git a/ge/graph/load/new_model_manager/davinci_model.cc b/ge/graph/load/new_model_manager/davinci_model.cc
index cb37182c..05b4cd10 100755
--- a/ge/graph/load/new_model_manager/davinci_model.cc
+++ b/ge/graph/load/new_model_manager/davinci_model.cc
@@ -1993,12 +1993,6 @@ Status DavinciModel::SyncVarData() {
                            RT_MEMCPY_HOST_TO_DEVICE));
   }
 
-  for (auto op_desc : variable_op_list_) {
-    ret =
-        VarManager::Instance(session_id_)->SyncVarData(runtime_param_.graph_id, op_desc->GetName(), op_desc, mem_base_);
-    GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "sync var data ret failed, model id:%u, op name:%s.", model_id_,
-                     op_desc->GetName().c_str());
-  }
   return ret;
 }
 
diff --git a/ge/graph/manager/graph_manager.cc b/ge/graph/manager/graph_manager.cc
index 282cd7a6..b9c56467 100755
--- a/ge/graph/manager/graph_manager.cc
+++ b/ge/graph/manager/graph_manager.cc
@@ -1997,6 +1997,8 @@ Status GraphManager::OptimizeStage1(ge::ComputeGraphPtr &compute_graph) {
                                                new (std::nothrow) TransOpWithoutReshapeFusionPass))
   GE_CHK_STATUS_RET(after_merge_passes.AddPass("OptimizeStage1_1::TransOpBreadthFusionPass",
                                                new (std::nothrow) TransOpBreadthFusionPass))
+  GE_CHK_STATUS_RET(
+      after_merge_passes.AddPass("OptimizeStage1_1::HcclMemcpyPass", new (std::nothrow) HcclMemcpyPass));
 
   GE_TIMESTAMP_START(after_merge_passes);
   auto ret = after_merge_passes.Run(compute_graph);
diff --git a/ge/graph/passes/hccl_memcpy_pass.cc b/ge/graph/passes/hccl_memcpy_pass.cc
index 21747f42..ada5397b 100755
--- a/ge/graph/passes/hccl_memcpy_pass.cc
+++ b/ge/graph/passes/hccl_memcpy_pass.cc
@@ -32,46 +32,152 @@ const char *const kInputMutable = "_input_mutable";
 }  // namespace
 namespace ge {
 Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) {
+  Status ret = SUCCESS;  // status of the most recent per-node step; returned once every direct node is handled
   GE_IF_BOOL_EXEC(graph == nullptr, GELOGE(PARAM_INVALID, "param [graph] must not be null."); return PARAM_INVALID);
   for (const auto &node : graph->GetDirectNode()) {
     auto op_desc = node->GetOpDesc();
-    GE_IF_BOOL_EXEC(op_desc == nullptr, continue);
+    if (op_desc == nullptr) {  // a node without op_desc aborts the pass instead of being skipped
+      GELOGE(INTERNAL_ERROR, "node has no op_desc, node_name : %s.", node->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+
+    ret = ContinuousInputProcess(graph, node);  // step 1: ops with continuous input and more than one input
+    if (ret != SUCCESS) {
+      GELOGE(INTERNAL_ERROR, "failed ContinuousInputProcess, node_name:%s.", node->GetName().c_str());
+      return ret;
+    }
+
+    ret = MutableInputProcess(graph, node);  // step 2: ops whose _input_mutable attr is true
+    if (ret != SUCCESS) {
+      GELOGE(INTERNAL_ERROR, "failed MutableInputProcess, node_name:%s.", node->GetName().c_str());
+      return ret;
+    }
+
+    ret = P2pmemInputProcess(graph, node);  // step 3: inputs whose memory type is RT_MEMORY_P2P_DDR
+    if (ret != SUCCESS) {
+      GELOGE(INTERNAL_ERROR, "failed P2pmemInputProcess, node_name:%s.", node->GetName().c_str());
+      return ret;
+    }
+
+  }
+  return ret;
+}
+
+// If the node has the _input_mutable attr set, its input memory may be modified while the op executes.
+// To keep that modification from affecting another op that consumes the same input,
+// an Identity (memcpy) node needs to be inserted between the producer and this node.
+// This also applies when the input is a variable or const.
+Status HcclMemcpyPass::MutableInputProcess(const ComputeGraphPtr &graph, const NodePtr node) {
+  auto op_desc = node->GetOpDesc();
+
+  bool node_input_mutable = false;
+  if (!AttrUtils::HasAttr(op_desc, kInputMutable)) {  // attr absent: nothing to do for this node
+    return SUCCESS;
+  }
+
+  if (!AttrUtils::GetBool(op_desc, kInputMutable, node_input_mutable)) {  // attr present but unreadable is an error
+    GELOGE(INTERNAL_ERROR, "node:%s get attr:_input_mutable failed.", node->GetName().c_str());
+    return FAILED;
+  }
+  if (!node_input_mutable) {
+    return SUCCESS;
+  }
 
-    bool node_input_mutable = false;
-    if (!AttrUtils::HasAttr(op_desc, kInputMutable)) {
+  GELOGI("input mutable hcom op is:%s.", op_desc->GetName().c_str());
+  for (auto &hccl_in_anchor : node->GetAllInDataAnchors()) {
+    if (hccl_in_anchor == nullptr) {
       continue;
     }
+    auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor();
+    GE_CHECK_NOTNULL(src_out_anchor);
 
-    GE_IF_BOOL_EXEC(!AttrUtils::GetBool(op_desc, kInputMutable, node_input_mutable),
-        GELOGE(INTERNAL_ERROR, "node:%s get attr:_input_mutable failed.", node->GetName().c_str()); return FAILED);
-    if (!node_input_mutable) {
+    int32_t src_out_anchor_size = src_out_anchor->GetPeerInDataAnchors().size();
+    if (src_out_anchor_size == kAnchorSize) {  // kAnchorSize is defined outside this view, presumably 1 (sole consumer); confirm
+      // Identity needs to be inserted between constant (/data) and hcomallreduce to avoid constant being cleared.
+      if (IsDataNode(src_out_anchor->GetOwnerNode()->GetType())) {
+        Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor);
+        if (ret != SUCCESS) {
+          GELOGE(INTERNAL_ERROR, "Failed to modify the connection.");
+          return ret;
+        }
+      }
       continue;
     }
 
-    GELOGI("hcom op is:%s.", op_desc->GetName().c_str());
+    Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor);  // peer-in count differs from kAnchorSize: insert regardless of producer type
+    if (ret != SUCCESS) {
+      GELOGE(INTERNAL_ERROR, "Failed to modify the connection.");
+      return ret;
+    }
+  }
+  return SUCCESS;
+}
+
+
+// If a broadcast op has more than one input and an input comes from a variable,
+// then, because the broadcast input memory should be continuous,
+// another featuremap memory will be allocated for the broadcast input.
+// In that case, data would be moved from variable memory to the broadcast input featuremap memory each step.
+// To avoid performing that move outside the model, a memcpy node is used instead of the move code.
+Status HcclMemcpyPass::ContinuousInputProcess(const ComputeGraphPtr &graph, const NodePtr node) {
+  auto op_desc = node->GetOpDesc();
+
+  bool is_input_continuous = false;
+  (void)ge::AttrUtils::GetBool(op_desc, ATTR_NAME_CONTINUOUS_INPUT, is_input_continuous);  // result ignored
+
+  if (is_input_continuous && op_desc->GetInputsSize() > 1) {
+    GELOGI("continuous input op is:%s.", op_desc->GetName().c_str());
+    // More than one input: insert a memcpy after each producer accepted by IsDataNode, to support continuous mem alloc.
     for (auto &hccl_in_anchor : node->GetAllInDataAnchors()) {
       if (hccl_in_anchor == nullptr) {
         continue;
       }
       auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor();
-      GE_CHECK_NOTNULL(src_out_anchor);
-
-      int32_t src_out_anchor_size = src_out_anchor->GetPeerInDataAnchors().size();
-      if (src_out_anchor_size == kAnchorSize) {
-        // Memcpyasync needs to be inserted between constant (/data) and hcomallreduce to avoid constant being cleared.
-        NodePtr src_node = src_out_anchor->GetOwnerNode();
-        std::string src_type = src_node->GetType();
-        bool check_src_type = (src_type == CONSTANTOP) || (src_type == DATA) || (src_type == CONSTANT);
-        if (check_src_type) {
-          Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor);
-          if (ret != SUCCESS) {
-            GELOGE(INTERNAL_ERROR, "Failed to modify the connection.");
-            return ret;
-          }
+      if (src_out_anchor == nullptr) {  // an unconnected input is treated as an error
+        GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str());
+        return INTERNAL_ERROR;
+      }
+
+      if (IsDataNode(src_out_anchor->GetOwnerNode()->GetType())) {  // other producer types are left connected directly
+        Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor);
+        if (ret != SUCCESS) {
+          GELOGE(INTERNAL_ERROR, "Failed to modify the connection.");
+          return ret;
         }
-        continue;
       }
+    }
+  }
+  return SUCCESS;
+}
+
+// If the input is of var type and the node input needs p2p mem, a memcpy should be inserted between the two.
+Status HcclMemcpyPass::P2pmemInputProcess(const ComputeGraphPtr &graph, const NodePtr node) {
+  auto op_desc = node->GetOpDesc();
+
+  vector<int64_t> input_memory_types;
+  (void) ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, input_memory_types);
 
+  if (input_memory_types.empty()) {
+    return SUCCESS;
+  }
+
+  for (uint32_t index = 0; index < input_memory_types.size() && index < op_desc->GetInputsSize(); index++) {
+    if (input_memory_types[index] != RT_MEMORY_P2P_DDR) {
+      continue;
+    }
+
+    GELOGI("p2p input op is:%s.", op_desc->GetName().c_str());
+    auto hccl_in_anchor = node->GetInDataAnchor(index);
+    if (hccl_in_anchor == nullptr) {
+      continue;
+    }
+    auto src_out_anchor = hccl_in_anchor->GetPeerOutAnchor();
+    if (src_out_anchor == nullptr) {
+      GELOGE(INTERNAL_ERROR, "hcom op input has no peer anchor, node_name:%s", node->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+
+    if (IsDataNode(src_out_anchor->GetOwnerNode()->GetType())) {
       Status ret = ModifyEdgeConnection(graph, src_out_anchor, hccl_in_anchor);
       if (ret != SUCCESS) {
         GELOGE(INTERNAL_ERROR, "Failed to modify the connection.");
@@ -82,8 +188,12 @@ Status HcclMemcpyPass::Run(ge::ComputeGraphPtr graph) {
   return SUCCESS;
 }
 
+bool HcclMemcpyPass::IsDataNode(const std::string& node_type) {  // true when node_type is one of the four types compared below
+  return (node_type == CONSTANTOP) || (node_type == VARIABLE) || (node_type == DATA) || (node_type == CONSTANT);
+}
+
 ///
-/// @brief Add MemcpyAsync Node
+/// @brief Add Identity Node
 /// @param [in] ge::ComputeGraphPtr graph
 /// @param [in] ge::OutDataAnchorPtr in_node
 /// @return ge::NodePtr
@@ -101,20 +211,20 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O
   node_name = CheckDuplicateName(node_name);
   OpDescPtr op_desc = MakeShared<OpDesc>(node_name.c_str(), IDENTITY);
   if (op_desc == nullptr) {
-    GELOGE(INTERNAL_ERROR, "Create identity op: MakeShared op_desc fail.");
+    GELOGE(INTERNAL_ERROR, "Create Identity op: MakeShared op_desc fail.");
     return nullptr;
   }
-  GELOGI("Create identity op:%s.", op_desc->GetName().c_str());
+  GELOGI("Create Identity op:%s.", op_desc->GetName().c_str());
 
   graphStatus ret = op_desc->AddInputDesc("x", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx()));
   if (ret != GRAPH_SUCCESS) {
-    GELOGE(INTERNAL_ERROR, "Create identity op: add input desc fail.");
+    GELOGE(INTERNAL_ERROR, "Create Identity op: add input desc fail.");
     return nullptr;
   }
 
   ret = op_desc->AddOutputDesc("y", pre_op_desc->GetOutputDesc(out_data_anchor->GetIdx()));
   if (ret != GRAPH_SUCCESS) {
-    GELOGE(INTERNAL_ERROR, "Create identity op: add output desc fail.");
+    GELOGE(INTERNAL_ERROR, "Create Identity op: add output desc fail.");
     return nullptr;
   }
   // because history reason ,this pass can not do work after constant fold so mark it
@@ -122,7 +232,7 @@ NodePtr HcclMemcpyPass::CreateIdentityNode(const ComputeGraphPtr &graph, const O
 
   NodePtr memcpy_node = graph->AddNode(op_desc);
   if (memcpy_node == nullptr) {
-    GELOGE(INTERNAL_ERROR, "Insert identity node fail.");
+    GELOGE(INTERNAL_ERROR, "Insert Identity node fail.");
     return nullptr;
   }
 
@@ -155,7 +265,8 @@ std::string HcclMemcpyPass::CheckDuplicateName(const std::string &node_name) {
 ///
 Status HcclMemcpyPass::ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor,
                                             const InDataAnchorPtr &hccl_in_anchor) {
-  GELOGI("The op %s need insert memcpy async op.", src_out_anchor->GetOwnerNode()->GetName().c_str());
+  GELOGI("Between op %s and op %s need insert memcpy async op.", src_out_anchor->GetOwnerNode()->GetName().c_str(),
+    hccl_in_anchor->GetOwnerNode()->GetName().c_str());
   NodePtr memcpy_node = CreateIdentityNode(graph, src_out_anchor);
   GE_CHECK_NOTNULL(memcpy_node);
 
diff --git a/ge/graph/passes/hccl_memcpy_pass.h b/ge/graph/passes/hccl_memcpy_pass.h
index e73a5483..1e946fa7 100755
--- a/ge/graph/passes/hccl_memcpy_pass.h
+++ b/ge/graph/passes/hccl_memcpy_pass.h
@@ -37,6 +37,14 @@ class HcclMemcpyPass : public GraphPass {
   Status ModifyEdgeConnection(const ComputeGraphPtr &graph, const OutDataAnchorPtr &src_out_anchor,
           const InDataAnchorPtr &hccl_in_anchor);
 
+  Status ContinuousInputProcess(const ComputeGraphPtr &graph, const NodePtr node);
+
+  Status MutableInputProcess(const ComputeGraphPtr &graph, const NodePtr node);
+
+  Status P2pmemInputProcess(const ComputeGraphPtr &graph, const NodePtr node);
+
+  bool IsDataNode(const std::string& node_type);
+
   std::unordered_map<std::string, uint32_t> node_num_map_;
 };
 }  // namespace ge
diff --git a/ge/graph/preprocess/graph_preprocess.cc b/ge/graph/preprocess/graph_preprocess.cc
index 98371426..e290c257 100644
--- a/ge/graph/preprocess/graph_preprocess.cc
+++ b/ge/graph/preprocess/graph_preprocess.cc
@@ -60,7 +60,6 @@
 #include "graph/passes/get_original_format_pass.h"
 #include "graph/passes/guarantee_const_pass.h"
 #include "graph/passes/hccl_group_pass.h"
-#include "graph/passes/hccl_memcpy_pass.h"
 #include "graph/passes/identity_pass.h"
 #include "graph/passes/infershape_pass.h"
 #include "graph/passes/iterator_op_pass.h"
@@ -1693,8 +1692,6 @@ Status GraphPrepare::PrepareOptimize() {
   PassManager graph_pass;
   try {
     (void)graph_pass.AddPass("PrepareOptimize::PrunePass", new PrunePass);
-    // todo 临时把hccl的memcpy插入放到图准备，为了防止其多插memcpy
-    (void)graph_pass.AddPass("PrepareOptimize::HcclMemcpyPass", new (std::nothrow) HcclMemcpyPass);
   } catch (std::bad_alloc &e) {
     GELOGE(INTERNAL_ERROR, "Add pass failed, bad memory allocation occurs.");
     return INTERNAL_ERROR;
diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h
index 4cee1b6f..7193ae0d 100644
--- a/inc/external/ge/ge_api_types.h
+++ b/inc/external/ge/ge_api_types.h
@@ -245,6 +245,11 @@ const std::string INPUT_FP16_NODES = "ge.INPUT_NODES_SET_FP16";
 // 0: close debug; 1: open TBE compiler; 2: open ccec compiler
 const std::string OP_DEBUG_LEVEL = "ge.opDebugLevel";
 
+// Option that fixes the hcombroadcast format.
+// When model multi mode is configured, the broadcast format should be fixed.
+// 0: data multi; 1: model multi.
+const std::string HCOM_MULTI_MODE = "ge.hcomMultiMode";
+
 // Graph run mode
 enum GraphRunMode { PREDICTION = 0, TRAIN };