diff --git a/build.sh b/build.sh
index 6cbc03d2..74f13849 100644
--- a/build.sh
+++ b/build.sh
@@ -176,7 +176,7 @@ cd ${BASEPATH}
 mkdir -p output/plugin/nnengine/ge_config/
 find output/ -name graphengine_lib.tar -exec rm {} \;
 cp src/ge/engine_manager/engine_conf.json output/plugin/nnengine/ge_config/
-find output/ -maxdepth 1 -name libengine.so -exec mv {} output/plugin/nnengine/ \;
+find output/ -maxdepth 1 -name libengine.so -exec mv -f {} output/plugin/nnengine/ \;
 tar -cf graphengine_lib.tar output/*
 mv -f graphengine_lib.tar output
 echo "---------------- GraphEngine package archive generated ----------------"
diff --git a/inc/common/util/compress/compress.h b/inc/common/util/compress/compress.h
new file mode 100644
index 00000000..6908fb75
--- /dev/null
+++ b/inc/common/util/compress/compress.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef COMPRESS_H
+#define COMPRESS_H
+
+#include <cstddef>
+
+enum CmpStatus { RET_SUCCESS = 0, RET_ERROR = -1 };
+
+struct CompressConfig {
+  size_t inputSize;    // length of data to compress
+  size_t engineNum;    // how many decompress engines
+  size_t maxRatio;     // max compression ratio of a basic block; only 64 is supported now (8x: 64, 4x: 32)
+  size_t channel;      // channels of L2 or DDR, for load balance
+  size_t fractalSize;  // size of one compressing block
+  bool isTight;        // whether to pack the compressed data tightly
+};
+
+CmpStatus CompressWeights(char* input, const CompressConfig& compressConfig, char* indexs, char* output,
+                          size_t& compressedLength);
+
+#endif  // COMPRESS_H
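Note: a minimal sketch of driving CompressWeights from the header above. The engineNum, channel, fractalSize, and buffer sizes are illustrative assumptions, not values the interface mandates:

    #include <vector>
    #include "common/util/compress/compress.h"

    CmpStatus CompressExample(char *weights, size_t weightSize) {
      CompressConfig config{};
      config.inputSize = weightSize;         // length of data to compress
      config.engineNum = 4;                  // assumed decompress engine count
      config.maxRatio = 64;                  // only 64 is supported now
      config.channel = 16;                   // assumed L2/DDR channel count
      config.fractalSize = 512;              // assumed compressing block size
      config.isTight = true;
      std::vector<char> indexs(1024);        // index buffer, size assumed
      std::vector<char> output(weightSize);  // worst case: no size reduction
      size_t compressedLength = 0;
      return CompressWeights(weights, config, indexs.data(), output.data(), compressedLength);
    }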
diff --git a/inc/common/util/platform_info.h b/inc/common/util/platform_info.h
new file mode 100644
index 00000000..52dc0621
--- /dev/null
+++ b/inc/common/util/platform_info.h
@@ -0,0 +1,97 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PLATFORM_INFO_H
+#define PLATFORM_INFO_H
+
+#include <map>
+#include <string>
+#include <vector>
+#include "platform_info_def.h"
+
+using std::map;
+using std::string;
+using std::vector;
+
+namespace fe {
+
+class PlatformInfoManager {
+ public:
+  PlatformInfoManager(const PlatformInfoManager &) = delete;
+  PlatformInfoManager &operator=(const PlatformInfoManager &) = delete;
+
+  static PlatformInfoManager &Instance();
+  uint32_t InitializePlatformInfo();
+  uint32_t Finalize();
+
+  uint32_t GetPlatformInfo(const string SoCVersion, PlatformInfo &platformInfo, OptionalInfo &optiCompilationInfo);
+
+  void SetOptionalCompilationInfo(OptionalInfo &optiCompilationInfo);
+
+ private:
+  PlatformInfoManager();
+  ~PlatformInfoManager();
+
+  uint32_t LoadIniFile(string iniFileRealPath);
+
+  void Trim(string &str);
+
+  uint32_t LoadConfigFile(string realPath);
+
+  string RealPath(const std::string &path);
+
+  string GetSoFilePath();
+
+  void ParseVersion(map<string, string> &versionMap, string &socVersion, PlatformInfo &platformInfoTemp);
+
+  void ParseSocInfo(map<string, string> &socInfoMap, PlatformInfo &platformInfoTemp);
+
+  void ParseCubeOfAICoreSpec(map<string, string> &aiCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseBufferOfAICoreSpec(map<string, string> &aiCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseUBOfAICoreSpec(map<string, string> &aiCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseAICoreSpec(map<string, string> &aiCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseBufferOfAICoreMemoryRates(map<string, string> &aiCoreMemoryRatesMap, PlatformInfo &platformInfoTemp);
+
+  void ParseAICoreMemoryRates(map<string, string> &aiCoreMemoryRatesMap, PlatformInfo &platformInfoTemp);
+
+  void ParseUBOfAICoreMemoryRates(map<string, string> &aiCoreMemoryRatesMap, PlatformInfo &platformInfoTemp);
+
+  void ParseAICoreintrinsicDtypeMap(map<string, string> &aiCoreintrinsicDtypeMap, PlatformInfo &platformInfoTemp);
+
+  void ParseVectorCoreSpec(map<string, string> &vectorCoreSpecMap, PlatformInfo &platformInfoTemp);
+
+  void ParseVectorCoreMemoryRates(map<string, string> &vectorCoreMemoryRatesMap, PlatformInfo &platformInfoTemp);
+
+  void ParseVectorCoreintrinsicDtypeMap(map<string, string> &vectorCoreintrinsicDtypeMap,
+                                        PlatformInfo &platformInfoTemp);
+
+  uint32_t ParsePlatformInfoFromStrToStruct(map<string, map<string, string>> &contentInfoMap, string &socVersion,
+                                            PlatformInfo &platformInfoTemp);
+
+  uint32_t AssemblePlatformInfoVector(map<string, map<string, string>> &contentInfoMap);
+
+ private:
+  bool initFlag_;
+  map<string, PlatformInfo> platformInfoMap_;
+  OptionalInfo optiCompilationInfo_;
+};
+
+}  // namespace fe
+#endif
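Note: a sketch of how a caller might query the singleton declared above, using only the public methods it exposes; "Ascend310" is a placeholder SoC version, and treating 0 as the success code is an assumption:

    #include "common/util/platform_info.h"

    uint32_t QueryAiCoreCount(uint32_t &aiCoreCnt) {
      fe::PlatformInfoManager &mgr = fe::PlatformInfoManager::Instance();
      uint32_t ret = mgr.InitializePlatformInfo();
      if (ret != 0) {
        return ret;  // assumed: non-zero means the platform config could not be loaded
      }
      fe::PlatformInfo platformInfo;
      fe::OptionalInfo optiCompilationInfo;
      ret = mgr.GetPlatformInfo("Ascend310", platformInfo, optiCompilationInfo);
      if (ret == 0) {
        aiCoreCnt = platformInfo.socInfo.aiCoreCnt;  // SoCInfo comes from platform_info_def.h
      }
      return ret;
    }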
diff --git a/inc/common/util/platform_info_def.h b/inc/common/util/platform_info_def.h
new file mode 100644
index 00000000..663a2cae
--- /dev/null
+++ b/inc/common/util/platform_info_def.h
@@ -0,0 +1,122 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PLATFORM_INFO_DEF_H
+#define PLATFORM_INFO_DEF_H
+
+#include <map>
+#include <string>
+#include <vector>
+
+using std::map;
+using std::string;
+using std::vector;
+
+namespace fe {
+enum MemoryType { DDR = 0, HBM };
+
+enum L2Type { Cache = 0, Buff };
+
+typedef struct tagStrInfo {
+  string aicVersion;
+  string ccecAICVersion;
+  string ccecAIVVersion;
+  string isSupportAIcpuCompiler;
+} StrInfo;
+
+typedef struct tagSoCInfo {
+  uint32_t aiCoreCnt;
+  uint32_t vectorCoreCnt;
+  uint32_t aiCpuCnt;
+  MemoryType memoryType;
+  uint64_t memorySize;
+  L2Type l2Type;
+  uint64_t l2Size;
+  uint32_t l2PageNum;
+} SoCInfo;
+
+typedef struct tagAiCoreSpec {
+  double cubeFreq;
+  uint64_t cubeMSize;
+  uint64_t cubeNSize;
+  uint64_t cubeKSize;
+  uint64_t vecCalcSize;
+  uint64_t l0ASize;
+  uint64_t l0BSize;
+  uint64_t l0CSize;
+  uint64_t l1Size;
+  uint64_t smaskBuffer;
+  uint64_t ubSize;
+  uint64_t ubblockSize;
+  uint64_t ubbankSize;
+  uint64_t ubbankNum;
+  uint64_t ubburstInOneBlock;
+  uint64_t ubbankGroupNum;
+} AiCoreSpec;
+
+typedef struct tagAiCoreMemoryRates {
+  double ddrRate;
+  double l2Rate;
+  double l2ReadRate;
+  double l2WriteRate;
+  double l1ToL0ARate;
+  double l1ToL0BRate;
+  double l1ToUBRate;
+  double l0CToUBRate;
+  double ubToL2Rate;
+  double ubToDdrRate;
+  double ubToL1Rate;
+} AiCoreMemoryRates;
+
+typedef struct tagVectorCoreSpec {
+  uint64_t vecCalcSize;
+  uint64_t smaskBuffer;
+  uint64_t ubSize;
+  uint64_t ubblockSize;
+  uint64_t ubbankSize;
+  uint64_t ubbankNum;
+  uint64_t ubburstInOneBlock;
+  uint64_t ubbankGroupNum;
+} VectorCoreSpec;
+
+typedef struct tagVectorCoreMemoryRates {
+  double ddrRate;
+  double l2Rate;
+  double l2ReadRate;
+  double l2WriteRate;
+  double ubToL2Rate;
+  double ubToDdrRate;
+} VectorCoreMemoryRates;
+
+typedef struct tagPlatformInfo {
+  StrInfo strInfo;
+  SoCInfo socInfo;
+  AiCoreSpec aiCoreSpec;
+  AiCoreMemoryRates aiCoreMemoryRates;
+  map<string, vector<string>> aiCoreIntrinsicDtypeMap;
+  VectorCoreSpec vectorCoreSpec;
+  VectorCoreMemoryRates vectorCoreMemoryRates;
+  map<string, vector<string>> vectorCoreIntrinsicDtypeMap;
+} PlatformInfo;
+
+typedef struct tagOptionalInfo {
+  string socVersion;
+  string coreType;
+  uint32_t aiCoreNum;
+  string l1FusionFlag;
+} OptionalInfo;
+}  // namespace fe
+#endif
diff --git a/inc/external/ge/ge_api_types.h b/inc/external/ge/ge_api_types.h
index bf9a10b4..13477bbd 100644
--- a/inc/external/ge/ge_api_types.h
+++ b/inc/external/ge/ge_api_types.h
@@ -40,6 +40,8 @@ const char *const OPTION_EXEC_EXTERN_PLUGIN_PATH = "ge.soLoadPath";
 const char *const OPTION_EXEC_ENABLE_DUMP = "ge.exec.enableDump";
 const char *const OPTION_EXEC_DUMP_PATH = "ge.exec.dumpPath";
 const char *const OPTION_EXEC_DUMP_STEP = "ge.exec.dumpStep";
+const char *const OPTION_EXEC_ENABLE_INCRE_BUILD = "ge.exec.enableIncreBuild";
+const char *const OPTION_EXEC_INCRE_BUILD_CACHE_PATH = "ge.exec.increBuildCachePath";
 // Hccl flag, if ge.exec.hcclFlag =1, it means load plugin for opskernel, else:ge.exec.hcclFlag =0
 const char *const OPTION_EXEC_HCCL_FLAG = "ge.exec.hcclFlag";
 const char *const OPTION_EXEC_ATOMIC_FLAG = "ge.exec.enable_atomic";
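Note: the two new keys are ordinary entries in the string-to-string option map handed to GE. A sketch of wiring them up; treating "1" as the on switch is an assumption modeled on the neighbouring ge.exec.hcclFlag convention, and the cache path is a placeholder:

    #include <map>
    #include <string>
    #include "ge/ge_api_types.h"

    std::map<std::string, std::string> BuildIncreBuildOptions() {
      std::map<std::string, std::string> options;
      options[ge::OPTION_EXEC_ENABLE_INCRE_BUILD] = "1";                   // assumed on/off convention
      options[ge::OPTION_EXEC_INCRE_BUILD_CACHE_PATH] = "/tmp/ge_cache/";  // placeholder path
      return options;
    }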
diff --git a/inc/external/register/register.h b/inc/external/register/register.h
index 045a1570..f96044de 100644
--- a/inc/external/register/register.h
+++ b/inc/external/register/register.h
@@ -116,27 +116,5 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpReceiver {
 namespace ge {
 using OpRegistrationData = domi::OpRegistrationData;
 using OpReceiver = domi::OpReceiver;
-
-class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOp {
- public:
-  HostCpuOp() = default;
-  virtual ~HostCpuOp() = default;
-
-  virtual graphStatus Compute(Operator &op, const std::map<std::string, const Tensor> &inputs,
-                              std::map<std::string, Tensor> &outputs) = 0;
-};
-
-class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOpRegistrar {
- public:
-  HostCpuOpRegistrar(const char *op_type, HostCpuOp *(*create_fn)());
-};
-
-#define REGISTER_HOST_CPU_OP_BUILDER(name, op) REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(__COUNTER__, name, op)
-
-#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(ctr, name, op) REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op)
-
-#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op)                               \
-  static ::ge::HostCpuOpRegistrar register_host_cpu_op##ctr __attribute__((unused)) = \
-    ::ge::HostCpuOpRegistrar(name, []() -> ::ge::HostCpuOp * { return new (std::nothrow) op(); })
 }  // namespace ge
 #endif  // INC_EXTERNAL_REGISTER_REGISTER_H_
diff --git a/inc/framework/common/types.h b/inc/framework/common/types.h
index 0adc812d..1cc2245b 100644
--- a/inc/framework/common/types.h
+++ b/inc/framework/common/types.h
@@ -434,6 +434,7 @@ REGISTER_OPTYPE_DECLARE(STREAMSWITCH, "StreamSwitch");
 REGISTER_OPTYPE_DECLARE(STREAMSWITCHN, "StreamSwitchN");
 REGISTER_OPTYPE_DECLARE(STREAMACTIVE, "StreamActive");
 REGISTER_OPTYPE_DECLARE(MEMCPYASYNC, "MemcpyAsync");
+REGISTER_OPTYPE_DECLARE(MEMCPYADDRASYNC, "MemcpyAddrAsync");
 REGISTER_OPTYPE_DECLARE(STREAMMERGE, "StreamMerge");
 REGISTER_OPTYPE_DECLARE(ENDGRAPH, "EndGraph");
 REGISTER_OPTYPE_DECLARE(SEND, "Send");
@@ -441,6 +442,7 @@ REGISTER_OPTYPE_DECLARE(RECV, "Recv");
 
 REGISTER_OPTYPE_DECLARE(LABELSET, "LabelSet");
 REGISTER_OPTYPE_DECLARE(LABELGOTO, "LabelGoto");
+REGISTER_OPTYPE_DECLARE(LABELGOTOEX, "LabelGotoEx");
 REGISTER_OPTYPE_DECLARE(LABELSWITCH, "LabelSwitch");
 REGISTER_OPTYPE_DECLARE(LABELSWITCHBYINDEX, "LabelSwitchByIndex");
diff --git a/inc/graph/debug/ge_attr_define.h b/inc/graph/debug/ge_attr_define.h
index 00f5edbd..57d1c6c6 100644
--- a/inc/graph/debug/ge_attr_define.h
+++ b/inc/graph/debug/ge_attr_define.h
@@ -979,9 +979,14 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_N_BATCH_SPILT;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NO_TASK_AND_DUMP_NEEDED;
 
+// functional ops attr
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WHILE_COND;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WHILE_BODY;
+
 // used for label switch
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LABEL_SWITCH_INDEX;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LABEL_SWITCH_LIST;
+
 // Varible
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REF_VAR_SRC_VAR_NAME;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_SRC_VAR_NAME;
diff --git a/inc/graph/detail/model_serialize_imp.h b/inc/graph/detail/model_serialize_imp.h
index 1d50577c..ad4e6475 100644
--- a/inc/graph/detail/model_serialize_imp.h
+++ b/inc/graph/detail/model_serialize_imp.h
@@ -22,7 +22,7 @@
 #include
 #include
 #include
 #include "graph/anchor.h"
-#include "detail/attributes_holder.h"
+#include "graph/detail/attributes_holder.h"
 #include "graph/ge_tensor.h"
 #include "graph/graph.h"
 #include "graph/node.h"
diff --git a/inc/graph/utils/graph_utils.h b/inc/graph/utils/graph_utils.h
index 8066e8b5..fb979e3e 100644
--- a/inc/graph/utils/graph_utils.h
+++ b/inc/graph/utils/graph_utils.h
@@ -262,6 +262,8 @@ class GraphUtils {
   static graphStatus MoveOutCtrlEdges(NodePtr &src_node, NodePtr &dst_node);
 
   static ComputeGraphPtr FindRootGraph(ComputeGraphPtr graph);
+
+  static graphStatus TopologicalSortingByName(const ge::ComputeGraphPtr &compute_graph, vector<NodePtr> &node_vec);
 };
 
 class ComputeGraphBuilder {
diff --git a/src/common/graph/compute_graph.cc b/src/common/graph/compute_graph.cc
index a35747d4..2dcc7a54 100644
--- a/src/common/graph/compute_graph.cc
+++ b/src/common/graph/compute_graph.cc
@@ -54,17 +54,34 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY size_t ComputeGraph::GetAllNodesS
   return s;
 }
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraph::Vistor<NodePtr> ComputeGraph::GetAllNodes() const {
-  vector<NodePtr> all_nodes(nodes_.size());
-  (void)std::copy(nodes_.begin(), nodes_.end(), all_nodes.begin());
-  for (const auto &sub_graph : sub_graph_) {
-    if (sub_graph == nullptr) {
-      GELOGW("sub graph is nullptr");
+  if (sub_graph_.empty()) {
+    return Vistor<NodePtr>(shared_from_this(), nodes_);
+  }
+
+  std::vector<NodePtr> all_nodes;
+  std::deque<NodePtr> candidates;
+
+  candidates.insert(candidates.begin(), nodes_.begin(), nodes_.end());
+
+  while (!candidates.empty()) {
+    NodePtr node = candidates.front();
+    all_nodes.emplace_back(node);
+    candidates.pop_front();
+
+    OpDescPtr op_desc = node->GetOpDesc();
+    if (op_desc == nullptr) {
       continue;
     }
-    for (const auto &node : sub_graph->GetAllNodes()) {
-      all_nodes.push_back(node);
+
+    const auto &subgraph_names = op_desc->GetSubgraphInstanceNames();
+    for (auto name_iter = subgraph_names.rbegin(); name_iter != subgraph_names.rend(); ++name_iter) {
+      auto subgraph = GetSubgraph(*name_iter);
+      if (subgraph != nullptr) {
+        candidates.insert(candidates.begin(), subgraph->nodes_.begin(), subgraph->nodes_.end());
+      }
     }
   }
+
   return Vistor<NodePtr>(shared_from_this(), all_nodes);
 }
 size_t ComputeGraph::GetDirectNodesSize() const { return nodes_.size(); }
@@ -602,7 +619,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ComputeGraph::InsertE
 graphStatus ComputeGraph::DFSTopologicalSorting(std::vector<NodePtr> &node_vec,
                                                 std::map<NodePtr, uint32_t> &map_in_edge_num,
                                                 std::vector<NodePtr> &stack) {
-  GELOGI("Runing_Dfs_Sort");
+  GELOGI("Running_Dfs_Sort: %s", name_.c_str());
   // Record the number of non data nodes but no input nodes
   GE_CHK_BOOL_EXEC(SortNodes(stack, map_in_edge_num) == GRAPH_SUCCESS, return GRAPH_FAILED, "sort nodes failed");
 
@@ -647,7 +664,7 @@ graphStatus ComputeGraph::DFSTopologicalSorting(std::vector<NodePtr> &node_vec,
 graphStatus ComputeGraph::BFSTopologicalSorting(std::vector<NodePtr> &node_vec,
                                                 std::map<NodePtr, uint32_t> &map_in_edge_num,
                                                 std::deque<NodePtr> &stack) {
-  GELOGI("Runing_Bfs_Sort");
+  GELOGI("Running_Bfs_Sort: %s", name_.c_str());
   std::vector<NodePtr> stack_input;
   std::map<string, NodePtr> breadth_node_map;
   // Record the number of non data nodes but no input nodes
@@ -735,7 +752,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ComputeGraph::Topolog
       use_BFS = true;
     }
   } else {
-    GELOGW("Get OPTION_GRAPH_RUN_MODE failed, use BFSTopologicalSorting by default.");
+    GELOGW("OPTION_GRAPH_RUN_MODE not set, use BFSTopologicalSorting by default.");
   }
 
   if (use_BFS) {
diff --git a/src/common/graph/ge_attr_define.cc b/src/common/graph/ge_attr_define.cc
index 92040051..961d3bc4 100644
--- a/src/common/graph/ge_attr_define.cc
+++ b/src/common/graph/ge_attr_define.cc
@@ -955,11 +955,8 @@ const std::string ATTR_NAME_DATA_DUMP_ORIGIN_FORMAT = "_datadump_origin_format";
 const std::string ATTR_NAME_DATA_DUMP_ORIGIN_DATA_TYPE = "_datadump_origin_data_type";
 
 // functional ops attr
-const std::string ATTR_NAME_TCOND = "Tcond";
-const std::string ATTR_NAME_TIN = "Tin";
-const std::string ATTR_NAME_TOUT = "Tout";
-const std::string ATTR_NAME_THEN_BRANCH = "then_branch";
-const std::string ATTR_NAME_ELSE_BRANCH = "else_branch";
+const std::string ATTR_NAME_WHILE_COND = "cond";
+const std::string ATTR_NAME_WHILE_BODY = "body";
 
 // used for label switch
 const std::string ATTR_NAME_LABEL_SWITCH_INDEX = "_label_switch_index";
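Note: the new GetAllNodes walks a work deque instead of recursing: a node is emitted, then the nodes of its subgraphs are spliced in front of its remaining siblings, and iterating the subgraph names in reverse keeps the first subgraph frontmost. A self-contained sketch of that pattern with a simplified stand-in type:

    #include <deque>
    #include <vector>

    struct Item {
      int id;
      std::vector<std::vector<Item>> subgraphs;  // stand-in for GetSubgraphInstanceNames()
    };

    std::vector<int> FlattenDepthFirst(const std::vector<Item> &roots) {
      std::vector<int> order;
      std::deque<Item> candidates(roots.begin(), roots.end());
      while (!candidates.empty()) {
        Item cur = candidates.front();
        candidates.pop_front();
        order.push_back(cur.id);
        // Reverse iteration plus front insertion preserves declaration order.
        for (auto it = cur.subgraphs.rbegin(); it != cur.subgraphs.rend(); ++it) {
          candidates.insert(candidates.begin(), it->begin(), it->end());
        }
      }
      return order;
    }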
ATTR_NAME_TIN = "Tin"; -const std::string ATTR_NAME_TOUT = "Tout"; -const std::string ATTR_NAME_THEN_BRANCH = "then_branch"; -const std::string ATTR_NAME_ELSE_BRANCH = "else_branch"; +const std::string ATTR_NAME_WHILE_COND = "cond"; +const std::string ATTR_NAME_WHILE_BODY = "body"; // used for label switch const std::string ATTR_NAME_LABEL_SWITCH_INDEX = "_label_switch_index"; diff --git a/src/common/graph/utils/graph_utils.cc b/src/common/graph/utils/graph_utils.cc index c5e45516..1886ee66 100644 --- a/src/common/graph/utils/graph_utils.cc +++ b/src/common/graph/utils/graph_utils.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include "./ge_context.h" #include "debug/ge_util.h" @@ -1999,4 +2000,60 @@ void PartialGraphBuilder::BuildExistNodes(graphStatus &error_code, std::string & GELOGD("Build exist nodes succ."); } + +GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus +GraphUtils::TopologicalSortingByName(const ge::ComputeGraphPtr &compute_graph, vector &node_vec) { + std::vector stack_input; + std::map map_in_edge_num; + graphStatus ret = compute_graph->SortNodes(stack_input, map_in_edge_num); + if (ret != GRAPH_SUCCESS) { + GELOGE(GRAPH_FAILED, "Sort nodes failed."); + return GRAPH_FAILED; + } + const size_t non_user_input_index = stack_input.size() - compute_graph->inputs_order_.size() - 1; + std::sort(stack_input.begin(), stack_input.begin() + non_user_input_index, + [](const NodePtr &a, const NodePtr &b) -> bool { return (a->GetName() > b->GetName()); }); + + std::queue stack; + NodePtr cur_node = nullptr; + std::map name_node_map; + vector nodes_name; + while (!stack_input.empty() || !stack.empty()) { + if (!stack.empty()) { + cur_node = stack.front(); + stack.pop(); + } else { + cur_node = stack_input.back(); + stack_input.pop_back(); + } + node_vec.emplace_back(cur_node); + compute_graph->CollectBreadthOutNode(cur_node, map_in_edge_num, name_node_map); + for (const auto &iter : name_node_map) { + nodes_name.emplace_back(iter.first); + } + std::sort(nodes_name.begin(), nodes_name.end()); + for (const auto &iter : nodes_name) { + stack.push(name_node_map[iter]); + } + name_node_map.clear(); + nodes_name.clear(); + } + // If they are not equal, there is a closed loop + if (node_vec.size() != compute_graph->nodes_.size()) { + std::set itered_nodes_set; + for (auto &node : node_vec) { + itered_nodes_set.insert(node.get()); + } + GE_LOGE("Failed to do topo sorting total %zu, itered %zu, exist closed loop in graph.", + compute_graph->nodes_.size(), node_vec.size()); + for (auto &node : compute_graph->nodes_) { + if (itered_nodes_set.count(node.get()) == 0) { + GE_LOGE("The node %s does not itered when topological sorting", node->GetName().c_str()); + } + } + return GRAPH_FAILED; + } + return GRAPH_SUCCESS; +} + } // namespace ge diff --git a/src/ge/CMakeLists.txt b/src/ge/CMakeLists.txt index 56e5e2b0..1a3434a5 100755 --- a/src/ge/CMakeLists.txt +++ b/src/ge/CMakeLists.txt @@ -41,6 +41,7 @@ include_directories(${GE_SOURCE_DIR}/inc/external/graph) include_directories(${GE_SOURCE_DIR}/inc/framework) include_directories(${GE_SOURCE_DIR}/inc/framework/common) include_directories(${GE_SOURCE_DIR}/inc/runtime) +include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/cce) include_directories(${GE_SOURCE_DIR}/third_party/securec/include) @@ -55,6 +56,7 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} 
"common/formats/utils/formats_trans_utils.cc" "common/fp16_t.cc" "common/ge/plugin_manager.cc" + "common/helper/model_cache_helper.cc" "common/profiling/profiling_manager.cc" "engine_manager/dnnengine_manager.cc" "ge_local_engine/engine/host_cpu_engine.cc" @@ -92,6 +94,7 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/load/new_model_manager/task_info/kernel_task_info.cc" "graph/load/new_model_manager/task_info/label_goto_task_info.cc" "graph/load/new_model_manager/task_info/label_set_task_info.cc" + "graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc" "graph/load/new_model_manager/task_info/memcpy_async_task_info.cc" "graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "graph/load/new_model_manager/task_info/stream_active_task_info.cc" @@ -269,6 +272,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "common/formats/utils/formats_trans_utils.cc" "common/fp16_t.cc" "common/ge/plugin_manager.cc" + "common/helper/model_cache_helper.cc" "common/profiling/profiling_manager.cc" "engine_manager/dnnengine_manager.cc" "ge_local_engine/engine/host_cpu_engine.cc" @@ -305,6 +309,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "graph/load/new_model_manager/task_info/kernel_task_info.cc" "graph/load/new_model_manager/task_info/label_goto_task_info.cc" "graph/load/new_model_manager/task_info/label_set_task_info.cc" + "graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc" "graph/load/new_model_manager/task_info/memcpy_async_task_info.cc" "graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "graph/load/new_model_manager/task_info/stream_active_task_info.cc" @@ -470,7 +475,7 @@ target_link_libraries(ge_compiler ${slog} ${mmpa} ${msprof} - ${runtime} + ${runtime_compiler} ${resouce} rt dl) diff --git a/src/ge/common/formats/format_transfers/datatype_transfer.cc b/src/ge/common/formats/format_transfers/datatype_transfer.cc index bac3a178..e5d21307 100644 --- a/src/ge/common/formats/format_transfers/datatype_transfer.cc +++ b/src/ge/common/formats/format_transfers/datatype_transfer.cc @@ -134,10 +134,6 @@ Status DataTypeTransfer::TransDataType(const CastArgs &args, TransResult &result } auto trans_mode = iter->second; - if (args.src_data_size == 0) { - GELOGE(PARAM_INVALID, "Invalid src data size %zu", args.src_data_size); - return PARAM_INVALID; - } int size = GetSizeByDataType(args.dst_data_type); if (size <= 0) { GELOGE(PARAM_INVALID, "Failed to calc size from data type %s", @@ -149,6 +145,12 @@ Status DataTypeTransfer::TransDataType(const CastArgs &args, TransResult &result return PARAM_INVALID; } size_t total_size = static_cast(args.src_data_size * size); + result.length = total_size; + if (total_size == 0) { + GELOGI("In TransDataType, total_size is zero, has no data."); + return SUCCESS; + } + std::shared_ptr dst(new (std::nothrow) uint8_t[total_size], std::default_delete()); if (dst == nullptr) { GELOGE(OUT_OF_MEMORY, "Failed to alloc the memory for dst buf %zu, data size %zu", total_size, args.src_data_size); @@ -162,7 +164,6 @@ Status DataTypeTransfer::TransDataType(const CastArgs &args, TransResult &result return INTERNAL_ERROR; } result.data = dst; - result.length = total_size; return SUCCESS; } diff --git a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc b/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc index 3458f83c..40dc749d 100644 --- a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc +++ 
diff --git a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc b/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc
index 3458f83c..40dc749d 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_c1hwncoc0_hwcn.cc
@@ -134,6 +134,11 @@ Status FormatTransferC1hwncoc0Hwcn::TransFormat(const TransArgs &args, TransResu
   int size = GetSizeByDataType(args.src_data_type);
   int64_t total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc b/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc
index 45808fa0..dc8e1033 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_dhwcn_fracz3D.cc
@@ -88,6 +88,11 @@ Status TransFormatDhwckToFz3D(const TransArgs &args, TransResult &result) {
     dst_size *= dim;
   }
   dst_size *= data_size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
diff --git a/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc b/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc
index 86c6935d..11e3d270 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_dhwnc_fracz3D_transpose.cc
@@ -89,6 +89,11 @@ Status TransFormatDhwncToFz3DTranspose(const TransArgs &args, TransResult &resul
     dst_size *= dim;
   }
   dst_size *= data_size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
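Note: the hunks above (and the ones that follow) all add the same guard: an empty tensor is reported as success with length 0 instead of tripping the error path. The shared essence, written as a stand-alone helper under the assumption that these transfers live in the ge::formats namespace alongside TransResult:

    inline bool HandleEmptyTensor(int64_t dst_size, ge::formats::TransResult &result) {
      if (dst_size == 0) {
        result.length = 0;  // nothing to transform; caller returns SUCCESS immediately
        return true;
      }
      return false;
    }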
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc b/src/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc
index 76834437..ff7b84a4 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fractal_nz.cc
@@ -116,6 +116,11 @@ Status CheckShapeRelation(const TransArgs &args, ShapeVector &hw_shape) {
 Status TransFormatFromNdToFracNz(const TransArgs &args, TransResult &result, const ShapeVector &hw_shape) {
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = GetItemNumByShape(args.dst_shape) * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size](), std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -184,6 +189,11 @@ Status TransFormatFromNdToFracNz(const TransArgs &args, TransResult &result, con
 Status TransFormatFromFracNzToNd(const TransArgs &args, TransResult &result, const ShapeVector &dst_hw_shape) {
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = GetItemNumByShape(args.dst_shape) * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fractal_z.cc b/src/ge/common/formats/format_transfers/format_transfer_fractal_z.cc
index aedc7589..f3d06496 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fractal_z.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fractal_z.cc
@@ -119,6 +119,11 @@ Status TransFormatFromNchwToFz(const TransArgs &args, TransResult &result) {
   int64_t total_ele_cnt = hf_cnt * vf_cnt * fractal_ele_cnt;
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = total_ele_cnt * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -194,6 +199,11 @@ Status TransFormatHwcnToFz(const TransArgs &args, TransResult &result) {
     dst_size *= dim;
   }
   dst_size *= data_size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -259,6 +269,11 @@ Status TransFormatNhwcToFz(const TransArgs &args, TransResult &result) {
     dst_size *= dim;
   }
   dst_size *= data_size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc b/src/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc
index 59baccff..d5507765 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fractal_zz.cc
@@ -117,6 +117,11 @@ Status CheckShapeRelation(const TransArgs &args, ShapeVector &hw_shape) {
 Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, const ShapeVector &hw_shape) {
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = GetItemNumByShape(args.dst_shape) * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size](), std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -153,8 +158,8 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con
         auto src_offset = (src_h_head + w1_idx * w0) * size;
         auto dst_offset = (h0_head + w1_idx * h0w0) * size;
         auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                ? dst_size - dst_offset
-                                : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                  ? dst_size - dst_offset
+                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
         auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                             static_cast<size_t>(size * w0));
         if (ret != EOK) {
@@ -169,8 +174,8 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con
           auto src_offset = (src_h_head + src_w_idx) * size;
           auto dst_offset = (w0_head + w0_idx) * size;
           auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? dst_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? dst_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
@@ -189,6 +194,11 @@ Status TransFormatFromNdToFracZz(const TransArgs &args, TransResult &result, con
 Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, const ShapeVector &dst_hw_shape) {
   int size = GetSizeByDataType(args.src_data_type);
   int64_t dst_size = GetItemNumByShape(args.dst_shape) * size;
+  if (dst_size == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size](), std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
@@ -226,8 +236,8 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con
         auto src_offset = (h0_head + w1_idx * h0w0) * size;
         auto dst_offset = (dst_h_head + w1_idx * w0) * size;
         auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                ? dst_size - dst_offset
-                                : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                  ? dst_size - dst_offset
+                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
         auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                             static_cast<size_t>(size * w0));
         if (ret != EOK) {
@@ -242,8 +252,8 @@ Status TransFormatFromFracZzToNd(const TransArgs &args, TransResult &result, con
           auto dst_w_idx = w1_head + w0_idx;
           auto dst_offset = (dst_h_head + dst_w_idx) * size;
           auto protected_size = dst_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? dst_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? dst_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc b/src/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc
index 3453c232..b0eebcfa 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fracz_hwcn.cc
@@ -133,6 +133,12 @@ Status FormatTransferFracZHwcn::TransFormat(const TransArgs &args, TransResult &
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc b/src/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc
index 6f616051..9f8d9e39 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fracz_nchw.cc
@@ -133,6 +133,12 @@ Status FormatTransferFracZNchw::TransFormat(const TransArgs &args, TransResult &
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
@@ -140,6 +146,7 @@ Status FormatTransferFracZNchw::TransFormat(const TransArgs &args, TransResult &
   GELOGD("Begin to trans format from FracZ to NCHW, src shape %s, data type %s, dst shape %s, memory size %ld",
          ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
          ShapeToString(args.dst_shape).c_str(), total_size);
+
   if (GetDstDataAfterTrans(args, result, size, total_size) != SUCCESS) {
     GELOGE(INTERNAL_ERROR, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld",
            ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
diff --git a/src/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc b/src/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc
index 57b840af..9a1e5f3b 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_fracz_nhwc.cc
@@ -132,6 +132,12 @@ Status FormatTransferFracZNhwc::TransFormat(const TransArgs &args, TransResult &
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc b/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc
index e7f6754f..7101256a 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_hwcn_c1hwncoc0.cc
@@ -35,7 +35,7 @@ Status TransShapeHwcnToC1hwncoc0(const DataType &data_type, const std::vector<int64_t> &src_shape,
                                  std::vector<int64_t> &dst_shape) {
   auto cube_size = GetCubeSizeByDataType(data_type);
   dst_shape.clear();
-  dst_shape.push_back((src_shape.at(kHwcnC) - 1) / cube_size + 1);
+  dst_shape.push_back(Ceil(src_shape.at(kHwcnC), static_cast<int64_t>(cube_size)));
   dst_shape.push_back(src_shape.at(kHwcnH));
   dst_shape.push_back(src_shape.at(kHwcnW));
   dst_shape.push_back(src_shape.at(kHwcnN));
@@ -169,6 +169,12 @@ Status FormatTransferHwcnC1hwncoc0::TransFormat(const TransArgs &args, TransResu
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc b/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc
index eab3ba96..57ab1266 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nchw.cc
@@ -58,7 +58,7 @@ Status CheckArgsForNc1hwc0ToNchw(const TransArgs &args) {
   }
   if (src_shape.at(kNc1hwc0H) != dst_shape.at(kNchwH) || src_shape.at(kNc1hwc0W) != dst_shape.at(kNchwW) ||
       src_shape.at(kNc1hwc0N) != dst_shape.at(kNchwN) || src_shape.at(kNc1hwc0C0) != c0 ||
-      src_shape.at(kNc1hwc0C1) != (dst_shape.at(kNchwC) - 1) / c0 + 1) {
+      src_shape.at(kNc1hwc0C1) != (Ceil(dst_shape.at(kNchwC), c0))) {
     GELOGE(PARAM_INVALID, "Failed to check relationship between src and dst shape, src shape %s, dst shape %s",
            ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str());
     return PARAM_INVALID;
@@ -102,8 +102,8 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) {
           auto src_offset = src_idx * size;
           auto dst_offset = dst_idx * size;
           auto protected_size = total_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? total_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? total_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
@@ -130,6 +130,12 @@ Status FormatTransferNc1hwc0Nchw::TransFormat(const TransArgs &args, TransResult
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc b/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc
index e9e8b19f..e68e54de 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nc1hwc0_nhwc.cc
@@ -58,7 +58,7 @@ Status CheckArgsForNc1hwc0ToNhwc(const TransArgs &args) {
   }
   if (src_shape.at(kNc1hwc0H) != dst_shape.at(kNhwcH) || src_shape.at(kNc1hwc0W) != dst_shape.at(kNhwcW) ||
       src_shape.at(kNc1hwc0N) != dst_shape.at(kNhwcN) || src_shape.at(kNc1hwc0C0) != c0 ||
-      src_shape.at(kNc1hwc0C1) != (dst_shape.at(kNhwcC) - 1) / c0 + 1) {
+      src_shape.at(kNc1hwc0C1) != (Ceil(dst_shape.at(kNhwcC), c0))) {
     GELOGE(PARAM_INVALID, "Failed to check relationship between src and dst shape, src shape %s, dst shape %s",
            ShapeToString(src_shape).c_str(), ShapeToString(dst_shape).c_str());
     return PARAM_INVALID;
@@ -102,8 +102,8 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) {
           auto src_offset = src_idx * size;
           auto dst_offset = dst_idx * size;
           auto protected_size = total_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? total_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? total_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           auto ret = memcpy_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), args.data + src_offset,
                               static_cast<size_t>(size));
           if (ret != EOK) {
@@ -130,6 +130,12 @@ Status FormatTransferNc1hwc0Nhwc::TransFormat(const TransArgs &args, TransResult
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc b/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
index 481a64e9..638cc9eb 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nchw_fz_c04.cc
@@ -134,6 +134,10 @@ Status TransFormatFromNchwToFzC04(const TransArgs &args, TransResult &result) {
                    GELOGE(INTERNAL_ERROR, "int64 mul overflow.A[%lld], B[%lld]", total_ele_cnt, size);
                    return INTERNAL_ERROR);
   int64_t dst_size = total_ele_cnt * size;
+  if (dst_size == 0) {
+    result.length = 0;
+    return SUCCESS;
+  }
 
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
@@ -219,6 +223,10 @@ Status PaddingNC(const TransArgs &args, TransArgs &args_tmp, std::shared_ptr<uint8_t> &dst) {
+  if (dst_size == 0) {
+    return SUCCESS;
+  }
+
   dst = std::shared_ptr<uint8_t>(new (std::nothrow) uint8_t[dst_size](), std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY, "Failed to trans format from %s to %s, can not alloc the memory for dst buf %ld",
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc b/src/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc
index 13e48f8c..b4e92cbc 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nchw_nc1hwc0.cc
@@ -40,7 +40,7 @@ Status TransShapeNchwToNc1hwc0(const std::vector<int64_t> &src_shape, DataType d
   }
   dst_shape.clear();
   dst_shape.push_back(src_shape.at(kNchwN));
-  dst_shape.push_back((src_shape.at(kNchwC) - 1) / c0 + 1);
+  dst_shape.push_back(Ceil(src_shape.at(kNchwC), c0));
   dst_shape.push_back(src_shape.at(kNchwH));
   dst_shape.push_back(src_shape.at(kNchwW));
   dst_shape.push_back(c0);
@@ -74,25 +74,8 @@ Status CheckArgsForNchwToNc1hwc0(const TransArgs &args) {
 
   return SUCCESS;
 }
-}  // namespace
 
-Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult &result) {
-  if (CheckArgsForNchwToNc1hwc0(args) != SUCCESS) {
-    return PARAM_INVALID;
-  }
-  // Guarantee the validity of parameters in check function
-  int size = GetSizeByDataType(args.src_data_type);
-  auto total_size = GetItemNumByShape(args.dst_shape) * size;
-  if (total_size <= 0) {
-    GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
-           ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
-    return PARAM_INVALID;
-  }
-  GELOGD(
-    "Begin to trans format from NCHW to NC1HWC0, src shape %s, data type "
-    "%s, dst shape %s memory size %ld",
-    ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
-    ShapeToString(args.dst_shape).c_str(), total_size);
+Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) {
   std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[total_size], std::default_delete<uint8_t[]>());
   if (dst == nullptr) {
     GELOGE(OUT_OF_MEMORY,
@@ -132,8 +115,8 @@ Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult
             int64_t dst_index = c0_idx + w_head_addr;
             int64_t dst_offset = dst_index * size;
             auto protected_size = total_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                    ? total_size - dst_offset
-                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                      ? total_size - dst_offset
+                                      : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
             int64_t cIdx = c0_idx + c1_idx * c0;
             int64_t srcIdx = n_idx * chw + cIdx * hw + h_idx * w + w_idx;
             auto src_offset = srcIdx * size;
@@ -150,7 +133,7 @@ Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult
             } else {
               auto ret =
-                memset_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), 0, static_cast<size_t>(size));
+                  memset_s(dst.get() + dst_offset, static_cast<size_t>(protected_size), 0, static_cast<size_t>(size));
               if (ret != EOK) {
                 GELOGE(INTERNAL_ERROR,
                        "Failed to set to 0 to "
@@ -169,6 +152,39 @@ Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult
   result.length = static_cast<size_t>(total_size);
   return SUCCESS;
 }
+}  // namespace
+
+Status FormatTransferNchwNc1hwc0::TransFormat(const TransArgs &args, TransResult &result) {
+  if (CheckArgsForNchwToNc1hwc0(args) != SUCCESS) {
+    return PARAM_INVALID;
+  }
+  // Guarantee the validity of parameters in check function
+  int size = GetSizeByDataType(args.src_data_type);
+  auto total_size = GetItemNumByShape(args.dst_shape) * size;
+  if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
+    GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
+           ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
+    return PARAM_INVALID;
+  }
+  GELOGD(
+    "Begin to trans format from NCHW to NC1HWC0, src shape %s, data type "
+    "%s, dst shape %s memory size %ld",
+    ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
+    ShapeToString(args.dst_shape).c_str(), total_size);
+  if (GetDstDataAfterTrans(args, result, size, total_size) != SUCCESS) {
+    GELOGE(INTERNAL_ERROR, "Failed to get data after trans, src shape %s, data type %s, dst shape %s, memory size %ld",
+           ShapeToString(args.src_shape).c_str(), TypeUtils::DataTypeToSerialString(args.src_data_type).c_str(),
+           ShapeToString(args.dst_shape).c_str(), total_size);
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
 
 Status FormatTransferNchwNc1hwc0::TransShape(Format src_format, const std::vector<int64_t> &src_shape,
                                              DataType data_type, Format dst_format, std::vector<int64_t> &dst_shape) {
diff --git a/src/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc b/src/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc
index b461e270..a5be94ff 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_nhwc_nc1hwc0.cc
@@ -38,7 +38,7 @@ Status TransShapeNhwcToNc1hwc0(const std::vector<int64_t> &src_shape, DataType d
   }
   dst_shape.clear();
   dst_shape.push_back(src_shape.at(kNhwcN));
-  dst_shape.push_back((src_shape.at(kNhwcC) - 1) / c0 + 1);
+  dst_shape.push_back(Ceil(src_shape.at(kNhwcC), c0));
   dst_shape.push_back(src_shape.at(kNhwcH));
   dst_shape.push_back(src_shape.at(kNhwcW));
   dst_shape.push_back(c0);
@@ -119,8 +119,8 @@ Status GetDstDataAfterTrans(const TransArgs &args, TransResult &result, const int size, const int64_t total_size) {
           int64_t dst_idx = c0_idx + w_head_addr;
           int64_t dst_offset = dst_idx * size;
           auto protected_size = total_size - dst_offset < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                                  ? total_size - dst_offset
-                                  : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                                    ? total_size - dst_offset
+                                    : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
           int64_t c_idx = c0_idx + c1_idx * c0;
           int64_t src_idx = n_idx * hwc + h_idx * wc + w_idx * c + c_idx;
           auto src_offset = src_idx * size;
@@ -161,6 +161,12 @@ Status FormatTransferNhwcNc1hwc0::TransFormat(const TransArgs &args, TransResult
   int size = GetSizeByDataType(args.src_data_type);
   auto total_size = GetItemNumByShape(args.dst_shape) * size;
   if (total_size <= 0) {
+    int64_t src_size = GetItemNumByShape(args.src_shape);
+    if (total_size == 0 && src_size == 0) {
+      result.length = static_cast<size_t>(total_size);
+      return SUCCESS;
+    }
+
     GELOGE(INTERNAL_ERROR, "Get %ld total size from dst shape %s, src shape %s", total_size,
            ShapeToString(args.dst_shape).c_str(), ShapeToString(args.src_shape).c_str());
     return PARAM_INVALID;
diff --git a/src/ge/common/formats/format_transfers/format_transfer_transpose.cc b/src/ge/common/formats/format_transfers/format_transfer_transpose.cc
index a523a326..ec309543 100644
--- a/src/ge/common/formats/format_transfers/format_transfer_transpose.cc
+++ b/src/ge/common/formats/format_transfers/format_transfer_transpose.cc
@@ -27,22 +27,22 @@ namespace ge {
 namespace formats {
 namespace {
 std::map<Format, std::map<Format, std::vector<int64_t>>> perm_args{
-  {FORMAT_NCHW,
-   {{FORMAT_NHWC, std::vector<int64_t>({0, 2, 3, 1})},
-    {FORMAT_HWCN, std::vector<int64_t>({2, 3, 1, 0})},
-    {FORMAT_CHWN, std::vector<int64_t>({1, 2, 3, 0})}}},
-  {FORMAT_NHWC,
-   {{FORMAT_NCHW, std::vector<int64_t>({0, 3, 1, 2})},
-    {FORMAT_CHWN, std::vector<int64_t>({3, 1, 2, 0})},
-    {FORMAT_HWCN, std::vector<int64_t>({1, 2, 3, 0})}}},
-  {FORMAT_HWCN,
-   {{FORMAT_NCHW, std::vector<int64_t>({3, 2, 0, 1})},
-    {FORMAT_NHWC, std::vector<int64_t>({3, 0, 1, 2})},
-    {FORMAT_CHWN, std::vector<int64_t>({2, 0, 1, 3})}}},
-  {FORMAT_CHWN,
-   {{FORMAT_NCHW, std::vector<int64_t>({3, 0, 1, 2})},
-    {FORMAT_NHWC, std::vector<int64_t>({3, 1, 2, 0})},
-    {FORMAT_HWCN, std::vector<int64_t>({1, 2, 0, 3})}}},
+    {FORMAT_NCHW,
+     {{FORMAT_NHWC, std::vector<int64_t>({0, 2, 3, 1})},
+      {FORMAT_HWCN, std::vector<int64_t>({2, 3, 1, 0})},
+      {FORMAT_CHWN, std::vector<int64_t>({1, 2, 3, 0})}}},
+    {FORMAT_NHWC,
+     {{FORMAT_NCHW, std::vector<int64_t>({0, 3, 1, 2})},
+      {FORMAT_CHWN, std::vector<int64_t>({3, 1, 2, 0})},
+      {FORMAT_HWCN, std::vector<int64_t>({1, 2, 3, 0})}}},
+    {FORMAT_HWCN,
+     {{FORMAT_NCHW, std::vector<int64_t>({3, 2, 0, 1})},
+      {FORMAT_NHWC, std::vector<int64_t>({3, 0, 1, 2})},
+      {FORMAT_CHWN, std::vector<int64_t>({2, 0, 1, 3})}}},
+    {FORMAT_CHWN,
+     {{FORMAT_NCHW, std::vector<int64_t>({3, 0, 1, 2})},
+      {FORMAT_NHWC, std::vector<int64_t>({3, 1, 2, 0})},
+      {FORMAT_HWCN, std::vector<int64_t>({1, 2, 0, 3})}}},
 };
 
 bool IsShapeArgValid(const std::vector<int64_t> &src_shape, const std::vector<int64_t> &perm_arg) {
@@ -51,8 +51,8 @@ Status Transpose(const uint8_t *src, const std::vector<int64_t> &src_shape, DataType src_data_type,
   int64_t dst_ele_num = GetItemNumByShape(dst_shape);
   int64_t data_size = GetSizeByDataType(src_data_type);
   int64_t dst_size = data_size * dst_ele_num;
-  std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
 
   GELOGD("Begin to transpose, src shape %s, perm arg %s, dst shape %s, data type %s", JoinToString(src_shape).c_str(),
          JoinToString(perm_arg).c_str(), JoinToString(dst_shape).c_str(),
          TypeUtils::DataTypeToSerialString(src_data_type).c_str());
+  if (dst_ele_num == 0) {
+    result.length = static_cast<size_t>(dst_size);
+    return SUCCESS;
+  }
+
+  std::shared_ptr<uint8_t> dst(new (std::nothrow) uint8_t[dst_size], std::default_delete<uint8_t[]>());
 
   int64_t dst_index = 0;
   std::vector<int64_t> dst_indexes(dst_shape.size());
   while (dst_index < dst_ele_num) {
     auto src_offset = GenOffset(src_heads, dst_indexes) * data_size;
     auto dst_offset_bytes = dst_index * data_size;
     auto protected_size = dst_size - dst_offset_bytes < static_cast<int64_t>(SECUREC_MEM_MAX_LEN)
-                            ? dst_size - dst_offset_bytes
-                            : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
+                              ? dst_size - dst_offset_bytes
+                              : static_cast<int64_t>(SECUREC_MEM_MAX_LEN);
     auto ret = memcpy_s(dst.get() + dst_offset_bytes, static_cast<size_t>(protected_size), src + src_offset,
                         static_cast<size_t>(data_size));
     if (ret != EOK) {
diff --git a/src/ge/common/formats/formats.cc b/src/ge/common/formats/formats.cc
index 938f0888..d01d055b 100644
--- a/src/ge/common/formats/formats.cc
+++ b/src/ge/common/formats/formats.cc
@@ -24,6 +24,7 @@
 #include
 #include
 
+#include "common/formats/utils/formats_trans_utils.h"
 #include "framework/common/debug/ge_log.h"
 #include "framework/common/ge_inner_error_codes.h"
 #include "graph/utils/type_utils.h"
@@ -38,10 +39,13 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Status TransFormat(const TransArg
            TypeUtils::FormatToSerialString(args.dst_format).c_str());
     return UNSUPPORTED;
   }
-  if (args.data == nullptr) {
+
+  auto src_shape_size = GetItemNumByShape(args.src_shape);
+  if (args.data == nullptr && src_shape_size != 0) {
     GELOGE(PARAM_INVALID, "Invalid input null data");
     return PARAM_INVALID;
   }
+
   return transfer->TransFormat(args, result);
 }
 
@@ -71,6 +75,12 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Status TransDataType(const CastAr
            TypeUtils::DataTypeToSerialString(args.dst_data_type).c_str());
     return UNSUPPORTED;
   }
+
+  if (args.data == nullptr && args.src_data_size != 0) {
+    GELOGE(PARAM_INVALID, "Invalid input null data");
+    return PARAM_INVALID;
+  }
+
   return transfer->TransDataType(args, result);
 }
diff --git a/src/ge/common/formats/utils/formats_trans_utils.cc b/src/ge/common/formats/utils/formats_trans_utils.cc
index 35a0a073..23da0f74 100644
--- a/src/ge/common/formats/utils/formats_trans_utils.cc
+++ b/src/ge/common/formats/utils/formats_trans_utils.cc
@@ -69,11 +69,11 @@ bool IsShapeValid(const std::vector<int64_t> &shape) {
   }
   int64_t num = 1;
   for (auto dim : shape) {
-    if (dim < 1) {
-      GELOGE(PARAM_INVALID, "Invalid zero dim in the shape %s", ShapeToString(shape).c_str());
+    if (dim < 0) {
+      GELOGE(PARAM_INVALID, "Invalid negative dim in the shape %s", ShapeToString(shape).c_str());
       return false;
     }
-    if (kShapeItemNumMAX / dim < num) {
+    if (dim != 0 && kShapeItemNumMAX / dim < num) {
       GELOGE(PARAM_INVALID, "Shape overflow, the total count should be less than %ld!", kShapeItemNumMAX);
       return false;
     }
diff --git a/src/ge/common/formats/utils/formats_trans_utils.h b/src/ge/common/formats/utils/formats_trans_utils.h
index 310aaf38..a8fbd09b 100644
--- a/src/ge/common/formats/utils/formats_trans_utils.h
+++ b/src/ge/common/formats/utils/formats_trans_utils.h
@@ -64,6 +64,9 @@ bool IsShapeEqual(const GeShape &src, const GeShape &dst);
 
 template <typename T>
 T Ceil(T n1, T n2) {
+  if (n1 == 0) {
+    return 0;
+  }
   return (n2 != 0) ? (n1 - 1) / n2 + 1 : 0;
 }
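Note: the zero guard matters because truncating integer division makes (0 - 1) / 16 + 1 evaluate to 1, so Ceil(0, 16) used to report one block for an empty dimension. A quick check of the new semantics, assuming the ge::formats namespace these utilities live in:

    #include <cassert>
    #include <cstdint>
    #include "common/formats/utils/formats_trans_utils.h"

    void CeilSmokeTest() {
      assert(ge::formats::Ceil<int64_t>(0, 16) == 0);   // empty dim: no blocks (was 1 before)
      assert(ge::formats::Ceil<int64_t>(16, 16) == 1);
      assert(ge::formats::Ceil<int64_t>(17, 16) == 2);
      assert(ge::formats::Ceil<int64_t>(5, 0) == 0);    // existing divide-by-zero guard
    }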
diff --git a/src/ge/common/helper/model_cache_helper.cc b/src/ge/common/helper/model_cache_helper.cc
new file mode 100644
index 00000000..58c82138
--- /dev/null
+++ b/src/ge/common/helper/model_cache_helper.cc
@@ -0,0 +1,1707 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "common/ge/ge_util.h"
+#include "common/helper/model_cache_helper.h"
+#include "common/types.h"
+#include "framework/common/debug/ge_log.h"
+#include "framework/common/ge_types.h"
+#include "framework/common/helper/model_helper.h"
+#include "framework/common/util.h"
+#include "graph/detail/attributes_holder.h"
+#include "graph/detail/model_serialize_imp.h"
+#include "graph/load/new_model_manager/davinci_model_parser.h"
+#include "graph/model.h"
+#include "graph/utils/graph_utils.h"
+#include "graph/utils/tensor_utils.h"
+#include "init/gelib.h"
+#include "proto/ge_ir.pb.h"
+
+using namespace std;
+
+namespace {
+const char *const kGraphName = "temp_name";
+const char *const kDpop = "DPOP";
+const char *const kDpopFunction = "dpop_function";
+// Keys of json
+const char *const kNodeNum = "nodeNum";
+const char *const kEdgeNum = "edgeNum";
+const char *const kGraphHash = "graphHash";
+const char *const kNodeHash = "nodeHash";
+const char *const kHash = "hash";
+const char *const kSessionId = "sessionId";
+const char *const kDeviceId = "deviceId";
+const char *const kJobId = "jobId";
+const char *const kGraphMemMaxSize = "graphMemMaxSize";
+const char *const kVarMemMaxSize = "varMemMaxSize";
+const char *const kVarMemLogicBase = "varMemLogicBase";
+const char *const kUseMaxMemSize = "useMaxMemSize";
+const char *const kMemResourceMap = "memResourceMap";
+const char *const kMemType = "memType";
+const char *const kTotalSize = "totalSize";
+const char *const kVarMemSize = "varMemSize";
+const char *const kVarResource = "varResource";
+const char *const kVarAddrMgrMap = "varAddrMgrMap";
+const char *const kName = "name";
+const char *const kAddress = "address";
+const char *const kOffset = "offset";
+const char *const kMemoryType = "memoryType";
+const char *const kTensorDesc = "tensorDesc";
+const char *const kDataType = "dataType";
+const char *const kShape = "shape";
+const char *const kLayout = "layout";
+const char *const kOriginDataType = "originDataType";
+const char *const kOriginShape = "originShape";
+const char *const kOriginLayout = "originLayout";
+const char *const kRealDimCnt = "realDimCnt";
+const char *const kCurVarTensorDescMap = "curVarTensorDescMap";
+const char *const kTransRoads = "transRoads";
+const char *const kTransRoad = "transRoad";
+const char *const kNodeType = "nodeType";
+const char *const kInputTensorDesc = "inputTensorDesc";
+const char *const kOutputTensorDesc = "outputTensorDesc";
+const char *const kChangedGraphId = "changedGraphId";
+const char *const kAllocatedGraphId = "allocatedGraphId";
+const char *const kGraphId = "graphId";
+const char *const kVarBroadcastInfo = "varBroadcastInfo";
+const char *const kBroadcastName = "broadcastName";
+const char *const kIdx = "idx";
+const char *const kInputOffset = "inputOffset";
+const char *const kInputSize = "inputSize";
+const char *const kOutputOffset = "outputOffset";
+const char *const kOutputSize = "outputSize";
+// Suffix of cache files
+const char *const kBeforeVarManagerSuffix = "_before_build_var_manager.json";
+const char *const kAfterVarManagerSuffix = "_after_build_var_manager.json";
+const char *const kManifestSuffix = ".manifest";
+const char *const kOmSuffix = ".om";
+}  // namespace
+const char *const kAfterVarManagerSuffix = "_after_build_var_manager.json";
+const char *const kManifestSuffix = ".manifest";
+const char *const kOmSuffix = ".om";
+}  // namespace
+
+namespace ge {
+map<uint32_t, uint32_t> ModelCacheHelper::graph_id_run_times_;
+ModelCacheHelper::ModelCacheHelper(uint64_t session_id, uint32_t graph_id, ComputeGraphPtr &compute_graph)
+    : session_id_(session_id),
+      graph_id_(graph_id),
+      compute_graph_(compute_graph),
+      is_cache_path_valid_for_output(false) {
+  if (graph_id_run_times_.count(graph_id) == 0) {
+    graph_id_run_times_[graph_id] = 1;
+  } else {
+    graph_id_run_times_[graph_id] = graph_id_run_times_[graph_id] + 1;
+  }
+  for (const auto &node : compute_graph_->GetDirectNode()) {
+    bool is_variable = (node->GetType() == VARIABLE) || (node->GetType() == VARIABLEV2) ||
+                       (node->GetType() == VARHANDLEOP) || (node->GetType() == CONSTANTOP);
+    if (!is_variable) {
+      continue;
+    }
+    var_names_.insert(node->GetName());
+  }
+  std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
+  if (instance_ptr != nullptr && instance_ptr->IsIncreBuild()) {
+    std::string cache_path = instance_ptr->GetIncreBuildCachePath();
+    GELOGD("Incre build path conf: %s", cache_path.c_str());
+    string fake_file_path = cache_path + to_string(graph_id_) + kManifestSuffix;
+    if (CheckOutputPathValid(fake_file_path)) {
+      is_cache_path_valid_for_output = true;
+    } else {
+      GELOGW("Invalid cache path for output.");
+    }
+    std::string real_cache_path = RealPath(cache_path.c_str());
+    if (real_cache_path.empty()) {
+      GELOGW("Invalid incre build cache path conf: %s", cache_path.c_str());
+      return;
+    }
+    cache_path_ = real_cache_path + '/';
+    GELOGD("Try to use incre build cache path: %s", cache_path_.c_str());
+  }
+}
+
+bool ModelCacheHelper::IsModelCacheHit() const {
+  CacheInfo cache_info;
+  if (GetCacheInfo(cache_info) != SUCCESS) {
+    GELOGI("Get cache info of graph id[%u] failed.", graph_id_);
+    return false;
+  }
+  // Check number of nodes and edges first.
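+  // Node and edge counts are cheap to compare, so mismatches are rejected
+  // before the more expensive graph-hash and per-node hash checks below.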
+  if (cache_info.node_num != compute_graph_->GetDirectNodesSize()) {
+    GELOGI("Graph id[%u] cache miss: the node number of the graph does not match the cache info.", graph_id_);
+    return false;
+  }
+  size_t edge_num = 0;
+  for (const auto &node : compute_graph_->GetDirectNode()) {
+    for (const auto &anchor : node->GetAllInAnchors()) {
+      edge_num += anchor->GetPeerAnchors().size();
+    }
+  }
+  if (cache_info.edge_num != edge_num) {
+    GELOGI("Graph id[%u] cache miss: the edge number of the graph does not match the cache info.", graph_id_);
+    return false;
+  }
+  size_t compute_graph_hash;
+  auto ret = GetComputeGraphHash(compute_graph_hash);
+  if (ret != SUCCESS || cache_info.graph_hash != compute_graph_hash) {
+    GELOGI("Graph id[%u] cache miss: the hash code of the graph does not match the cache info.", graph_id_);
+    return false;
+  }
+  if (!IsNodeHashSameAsCache(cache_info.nodes_hash)) {
+    GELOGI("Graph id[%u] cache miss: the hash code of node does not match the cache info.", graph_id_);
+    return false;
+  }
+
+  string var_manager_cache = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kBeforeVarManagerSuffix;
+  Json var_manager_json;
+  if (LoadJsonFromFile(var_manager_cache, var_manager_json) != SUCCESS) {
+    GELOGW("Fail to load json from cache file: %s", var_manager_cache.c_str());
+    return false;
+  }
+  if (!IsVarManagerSameAsCache(var_manager_json)) {
+    GELOGI("Graph id[%u] cache miss: the VarManager does not match the cache info.", graph_id_);
+    return false;
+  }
+  GELOGI("Graph id[%u] cache hit.", graph_id_);
+  return true;
+}
+
+Status ModelCacheHelper::RefreshComputeGraph(const ComputeGraphPtr &compute_graph) {
+  if (compute_graph->IsValid()) {
+    compute_graph_ = compute_graph;
+    var_names_.clear();
+    for (const auto &node : compute_graph_->GetDirectNode()) {
+      bool is_variable = (node->GetType() == VARIABLE) || (node->GetType() == VARIABLEV2) ||
+                         (node->GetType() == VARHANDLEOP) || (node->GetType() == CONSTANTOP);
+      if (!is_variable) {
+        continue;
+      }
+      var_names_.insert(node->GetName());
+    }
+    return SUCCESS;
+  } else {
+    GELOGW("Invalid compute graph.");
+    return FAILED;
+  }
+}
+
+Status ModelCacheHelper::ClearCache(uint32_t graph_id) const {
+  if (!is_cache_path_valid_for_output) {
+    GELOGW("Invalid cache path.");
+    return SUCCESS;
+  }
+  string manifest_file = cache_path_ + to_string(graph_id) + kManifestSuffix;
+  string manifest_file_path = RealPath(manifest_file.c_str());
+  int ret;
+  if (!manifest_file_path.empty()) {
+    ret = remove(manifest_file_path.c_str());
+    // If remove file failed, print the warning log
+    if (ret != 0) {
+      GELOGW("Clear cache [%s] failed.", manifest_file_path.c_str());
+    }
+  }
+  string before_var_manager_file = cache_path_ + to_string(graph_id) + kBeforeVarManagerSuffix;
+  string before_var_manager_file_path = RealPath(before_var_manager_file.c_str());
+  if (!before_var_manager_file_path.empty()) {
+    ret = remove(before_var_manager_file_path.c_str());
+    if (ret != 0) {
+      GELOGW("Clear cache [%s] failed.", before_var_manager_file_path.c_str());
+    }
+  }
+  string after_var_manager_file = cache_path_ + to_string(graph_id) + kAfterVarManagerSuffix;
+  string after_var_manager_file_path = RealPath(after_var_manager_file.c_str());
+  if (!after_var_manager_file_path.empty()) {
+    ret = remove(after_var_manager_file_path.c_str());
+    if (ret != 0) {
+      GELOGW("Clear cache [%s] failed.", after_var_manager_file_path.c_str());
+    }
+  }
+  string om_file = cache_path_ + to_string(graph_id) + kOmSuffix;
+  string om_file_path = RealPath(om_file.c_str());
+  if
(!om_file_path.empty()) { + ret = remove(om_file_path.c_str()); + if (ret != 0) { + GELOGW("Clear cache [%s] failed.", om_file_path.c_str()); + } + } + return SUCCESS; +} + +Status ModelCacheHelper::RecoverVarManagerFromCache() const { + string var_manager_cache = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kAfterVarManagerSuffix; + Json var_manager_json; + if (LoadJsonFromFile(var_manager_cache, var_manager_json) != SUCCESS) { + GELOGW("Fail to load json from cache file: %s", var_manager_cache.c_str()); + return FAILED; + } + + Json mem_resource_json = move(var_manager_json[kMemResourceMap]); + auto ret = RecoverMemResource(mem_resource_json); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[MemResource]"); + return FAILED; + } + Json var_resource_json = move(var_manager_json[kVarResource]); + ret = RecoverAllocatedGraphId(var_resource_json[kAllocatedGraphId]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[AllocatedGraphId]"); + return FAILED; + } + ret = RecoverChangedGraphId(var_resource_json[kChangedGraphId]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[ChangedGraphId]"); + return FAILED; + } + ret = RecoverBroadcastInfo(var_resource_json[kVarBroadcastInfo]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[VarBroadcastInfo]"); + return FAILED; + } + ret = RecoverVarAddrAndTensorDesc(var_resource_json[kVarAddrMgrMap]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[VarAddrMgrMap & CurVarTensorDesc]"); + return FAILED; + } + ret = RecoverTransRoads(var_resource_json[kTransRoads]); + if (ret != SUCCESS) { + GELOGW("Recover VarManager from cache failed.[TransRoads]"); + return FAILED; + } + GELOGI("Recover VarManager from cache[%s] success.", cache_path_.c_str()); + return SUCCESS; +} + +Status ModelCacheHelper::RecompileNodes(GeModelPtr &ge_model) { + std::shared_ptr instance = ge::GELib::GetInstance(); + if (instance == nullptr || !instance->InitFlag()) { + GELOGW("RecompileNodes failed."); + return ge::GE_CLI_GE_NOT_INITIALIZED; + } + auto compute_graph = GraphUtils::GetComputeGraph(ge_model->GetGraph()); + vector nodes; + for (auto &node : compute_graph->GetDirectNode()) { + if (node == nullptr) { + continue; + } + auto op_desc = node->GetOpDesc(); + if (op_desc == nullptr) { + continue; + } + + string kernel_lib_name = op_desc->GetOpKernelLibName(); + if (kernel_lib_name.empty()) { + // reset op kernel lib + (void)instance->DNNEngineManagerObj().GetDNNEngineName(op_desc); + kernel_lib_name = op_desc->GetOpKernelLibName(); + if (kernel_lib_name.empty()) { + GELOGW("Get node:%s, type:%s kernel lib failed.", node->GetName().c_str(), op_desc->GetType().c_str()); + return INTERNAL_ERROR; + } + } + OpsKernelInfoStorePtr kernel_info = instance->OpsKernelManagerObj().GetOpsKernelInfoStore(kernel_lib_name); + if (kernel_info == nullptr) { + GELOGW("Get op %s ops kernel info store failed", node->GetName().c_str()); + return INTERNAL_ERROR; + } + auto ge_desc = MakeShared(op_desc); + if (ge_desc == nullptr) { + GELOGE(GE_GRAPH_MEMORY_ALLOC_FAILED, "Fail to malloc op desc."); + return FAILED; + } + // TBE compile op + vector node_vec = {node}; + auto ret = kernel_info->CompileOp(node_vec); + if (ret != ge::SUCCESS) { + GELOGW("Compile single op failed, node name is %s", node->GetName().c_str()); + return ret; + } + } + // Reset TBE Kernels + TBEKernelStore tbe_kernel_store; + for (const ge::NodePtr &n : compute_graph->GetDirectNode()) { + auto 
node_op_desc = n->GetOpDesc();
+    GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);
+    TBEKernelPtr tbe_kernel = node_op_desc->TryGetExtAttr(ge::OP_EXTATTR_NAME_TBE_KERNEL, TBEKernelPtr());
+    GE_IF_BOOL_EXEC(tbe_kernel == nullptr, continue);
+    tbe_kernel_store.AddTBEKernel(tbe_kernel);
+    GELOGD("Add tbe kernel bin %s", tbe_kernel->GetName().c_str());
+  }
+  if (!tbe_kernel_store.Build()) {
+    GELOGW("TBE Kernels store build failed!");
+    return FAILED;
+  }
+  ge_model->SetTBEKernelStore(tbe_kernel_store);
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::GetNodesHash(map<std::string, size_t> &hash_map) const {
+  vector<NodePtr> nodes;
+  GraphUtils::TopologicalSortingByName(compute_graph_, nodes);
+  ModelSerializeImp model_serialize_imp;
+  std::hash<string> node_hash;
+  for (const auto &node : nodes) {
+    if (node == nullptr) {
+      continue;
+    }
+    proto::OpDef op_def;
+    bool is_framework_op = (node->GetType() == FRAMEWORKOP);
+    string type;
+    bool is_dpop = false;
+    string origin_dpop_name;
+    if (is_framework_op) {
+      if (AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_FRAMEWORK_ORIGINAL_TYPE, type)) {
+        GELOGI("Get original type of framework op[%s], %s.", node->GetName().c_str(), type.c_str());
+        if (type == kDpop) {
+          GELOGI("DPOP op found:%s.", node->GetName().c_str());
+          origin_dpop_name = node->GetName();
+          node->GetOpDesc()->SetName(kDpopFunction);
+          is_dpop = true;
+        }
+      } else {
+        GELOGW("Get original type of framework op[%s] failed.", node->GetName().c_str());
+      }
+    }
+    bool ret = model_serialize_imp.SerializeNode(node, &op_def, is_framework_op);
+    op_def.set_id(0);
+    if (is_dpop) {
+      node->GetOpDesc()->SetName(origin_dpop_name);
+    }
+    if (!ret) {
+      GELOGW("Fail to serialize node[%s].", node->GetName().c_str());
+      return INTERNAL_ERROR;
+    }
+    string prototxt;
+    ret = google::protobuf::TextFormat::PrintToString(op_def, &prototxt);
+    if (!ret) {
+      GELOGW("Print OpDef to string failed.");
+      hash_map.clear();
+      return INTERNAL_ERROR;
+    }
+    size_t hash_code = node_hash(prototxt);
+    if (is_dpop) {
+      hash_map[kDpopFunction] = hash_code;
+    } else {
+      hash_map[node->GetName()] = hash_code;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::GetComputeGraphHash(size_t &hash) const {
+  proto::GraphDef graph_proto;
+  ModelSerializeImp model_serialize_imp;
+  // The name of compute graph may be generated randomly, so replace it temporarily.
+  const string origin_name = compute_graph_->GetName();
+  compute_graph_->SetName(kGraphName);
+  bool serialize_ret = model_serialize_imp.SerializeGraph(compute_graph_, &graph_proto);
+  graph_proto.clear_op();
+  // Restore the original name before any early return.
+  compute_graph_->SetName(origin_name);
+  if (!serialize_ret) {
+    GELOGW("Serialize graph failed.");
+    hash = 0;
+    return INTERNAL_ERROR;
+  }
+  // Generate proto text of GraphDef
+  string prototxt;
+  bool print_ret = google::protobuf::TextFormat::PrintToString(graph_proto, &prototxt);
+  if (!print_ret) {
+    GELOGW("Print GraphDef to string failed.");
+    hash = 0;
+    return INTERNAL_ERROR;
+  }
+  // Get the hash code of proto text
+  std::hash<string> graph_hash;
+  hash = graph_hash(prototxt);
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::SaveJsonToFile(const string &file_name, const Json &json) const {
+  if (!is_cache_path_valid_for_output) {
+    GELOGW("Invalid cache path.");
+    return PARAM_INVALID;
+  }
+  // Create (or truncate) the cache file with owner-only permissions before writing.
+  string real_path = RealPath(cache_path_.c_str());
+  if (real_path.empty()) {
+    GELOGW("File path is invalid, please check cache path: %s", cache_path_.c_str());
+    return FAILED;
+  }
+  const string path = cache_path_ + file_name;
+  const int FILE_AUTHORITY = 0600;
+  int fd = open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, FILE_AUTHORITY);
+  if (fd < 0) {
+    GELOGW("Fail to open the file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+  if (close(fd) != 0) {
+    GELOGW("Fail to close the file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+
+  // Write json into cache file
+  ofstream ofs;
+  ofs.open(path);
+  if (!ofs.is_open()) {
+    GELOGW("Fail to open the file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+  ofs << json << std::endl;
+  ofs.close();
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::LoadJsonFromFile(const string &file_name, Json &json) const {
+  if (!json.is_null()) {
+    GELOGW("Input param json type should be null.");
+    return PARAM_INVALID;
+  }
+  string real_path = RealPath(cache_path_.c_str());
+  if (real_path.empty()) {
+    GELOGW("File path is invalid, please check cache path: %s", cache_path_.c_str());
+    return FAILED;
+  }
+  const string path = cache_path_ + file_name;
+  if (!CheckInputPathValid(path)) {
+    GELOGW("Invalid cache path for input: %s.", path.c_str());
+    return FAILED;
+  }
+  string cache_real_path = RealPath(path.c_str());
+  if (cache_real_path.empty()) {
+    GELOGI("File[%s] is not found.", path.c_str());
+    return FAILED;
+  }
+  // Read json from cache file
+  ifstream ifs;
+  ifs.open(path);
+  if (!ifs.is_open()) {
+    GELOGW("Fail to open the file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+  ifs >> json;
+  if (!json.is_object()) {
+    GELOGW("Fail to load the json file: %s.", path.c_str());
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::SaveCacheInfoToCache() const {
+  // Generate cache json
+  // example: {"edgeNum":6,"nodeNum":7,"graphHash":134714827475991356}
+  Json cache_json;
+  try {
+    cache_json[kNodeNum] = compute_graph_->GetDirectNodesSize();
+    size_t edge_num = 0;
+    for (const auto &node : compute_graph_->GetDirectNode()) {
+      for (const auto &anchor : node->GetAllInAnchors()) {
+        edge_num += anchor->GetPeerAnchors().size();
+      }
+    }
+    cache_json[kEdgeNum] = edge_num;
+    size_t hash = 0;
+    auto ret = GetComputeGraphHash(hash);
+    if (ret != SUCCESS) {
+      GELOGW("Error occurred when generating the graph hash code.");
+      return ret;
+    }
+    cache_json[kGraphHash] = hash;
+    Json nodes_hash_json;
+    ret = GetNodesHashMapJson(nodes_hash_json);
+    if (ret != SUCCESS) {
+      GELOGW("Error occurred when generating the nodes hash code.");
+      return ret;
+    }
+    cache_json[kNodeHash] = nodes_hash_json;
+  } catch (const std::exception &e) {
+    GELOGW("Fail to generate cache info json. Error message: %s", e.what());
+    return INTERNAL_ERROR;
+  }
+  string cache_manifest = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kManifestSuffix;
+
+  auto ret = SaveJsonToFile(cache_manifest, cache_json);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to save cache info to json file, path: %s.", cache_path_.c_str());
+    return ret;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::GetCacheInfo(CacheInfo &cache_info) const {
+  string cache_manifest = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kManifestSuffix;
+  Json cache_json;
+  if (LoadJsonFromFile(cache_manifest, cache_json) != SUCCESS) {
+    GELOGW("Fail to load json from cache file: %s", cache_manifest.c_str());
+    return INTERNAL_ERROR;
+  }
+  if (!cache_json.is_object()) {
+    GELOGW("Manifest should be a json object.");
+    return INTERNAL_ERROR;
+  }
+  try {
+    cache_info.node_num = cache_json[kNodeNum];
+    cache_info.edge_num = cache_json[kEdgeNum];
+    cache_info.graph_hash = cache_json[kGraphHash];
+    Json nodes_hash_json = cache_json[kNodeHash];
+    if (!(nodes_hash_json.is_null() || nodes_hash_json.is_array())) {
+      GELOGW("Nodes hash in cache should be null or an array.");
+      return FAILED;
+    }
+    for (const auto &iter : nodes_hash_json) {
+      cache_info.nodes_hash[iter[kName].get<std::string>()] = iter[kHash].get<size_t>();
+    }
+  } catch (const std::exception &e) {
+    GELOGW("Fail to get info from json file. Error message: %s", e.what());
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
+
+bool ModelCacheHelper::IsAllocatedGraphIdSameAsCache(Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return false;
+  }
+  // Compare allocated graph id info between json and VarManager
+  std::unordered_map<std::string, uint32_t> allocated_graph_id;
+  auto ret = ParseAllocatedGraphIdFromJson(json, allocated_graph_id);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse AllocatedGraphId from Json.");
+    return false;
+  }
+  for (const auto &iter : allocated_graph_id) {
+    uint32_t graph_id = 0;
+    ret = VarManager::Instance(session_id_)->GetAllocatedGraphId(iter.first, graph_id);
+    if (ret != SUCCESS) {
+      GELOGW("Fail to find allocated graph id of var[%s].", iter.first.c_str());
+      return false;
+    }
+    if (graph_id != iter.second) {
+      GELOGW("The allocated graph id of variable[%s] in cache is different from VarManager.", iter.first.c_str());
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ModelCacheHelper::IsNodeHashSameAsCache(const map<std::string, size_t> &hash_map) const {
+  map<std::string, size_t> cur_hash_map;
+  GetNodesHash(cur_hash_map);
+  if (hash_map.size() != cur_hash_map.size()) {
+    GELOGI("The number of hash code is different from cache info.");
+    return false;
+  }
+  for (const auto &iter : cur_hash_map) {
+    if (hash_map.count(iter.first) == 0) {
+      GELOGI("Node[%s] is not found in cache info.", iter.first.c_str());
+      return false;
+    }
+    if (hash_map.at(iter.first) != iter.second) {
+      GELOGI("The hash code of node[%s] is different from cache info.", iter.first.c_str());
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ModelCacheHelper::IsMemResourceSameAsCache(Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return false;
+  }
+  // Compare var mem size info between json and VarManager
+  std::map<rtMemType_t, int64_t> var_mem_size;
+  auto ret = ParseMemResourceFromJson(json, var_mem_size);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse MemResource from Json.");
+    return false;
+  }
+  for (const auto &iter : var_mem_size) {
+    int64_t mem_size =
VarManager::Instance(session_id_)->GetVarMemSize(iter.first); + if (mem_size != iter.second) { + GELOGW("The var mem size of memory_type[%u] in cache is different from VarManager.", iter.first); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsChangedGraphIdSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare variable changed graph id info between json and VarManager + std::unordered_map changed_graph_id; + auto ret = ParseChangedGraphIdFromJson(json, changed_graph_id); + if (ret != SUCCESS) { + GELOGW("Fail to parse ChangedGraphId from Json."); + return false; + } + for (const auto &iter : changed_graph_id) { + uint32_t graph_id = 0; + ret = VarManager::Instance(session_id_)->GetChangedGraphId(iter.first, graph_id); + if (ret != SUCCESS) { + GELOGW("Fail to find changed graph id of var[%s].", iter.first.c_str()); + return false; + } + if (graph_id != iter.second) { + GELOGW("The changed graph id of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsCurVarTensorDescSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare variable tensor desc info between json and VarManager + std::unordered_map cur_var_tensor_desc; + auto ret = ParseCurVarTensorDescMapFromJson(json, cur_var_tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to parse CurVarTensorDesc from Json."); + return false; + } + for (const auto &iter : cur_var_tensor_desc) { + GeTensorDesc tensor_desc; + ret = VarManager::Instance(session_id_)->GetCurVarDesc(iter.first, tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to find tensor desc of var[%s].", iter.first.c_str()); + return false; + } + uint32_t l_real_dim_cnt = 0; + uint32_t r_real_dim_cnt = 0; + TensorUtils::GetRealDimCnt(tensor_desc, l_real_dim_cnt); + TensorUtils::GetRealDimCnt(iter.second, r_real_dim_cnt); + if ((tensor_desc.GetDataType() != iter.second.GetDataType()) || + (tensor_desc.GetOriginDataType() != iter.second.GetOriginDataType()) || + (tensor_desc.GetFormat() != iter.second.GetFormat()) || + (tensor_desc.GetOriginFormat() != iter.second.GetOriginFormat()) || + (tensor_desc.GetShape().ToString() != iter.second.GetShape().ToString()) || + (tensor_desc.GetOriginShape().ToString() != iter.second.GetOriginShape().ToString()) || + (l_real_dim_cnt != r_real_dim_cnt)) { + GELOGW("The var tensor desc of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsVarAddrMgrMapSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare variable address info between json and VarManager + std::vector> var_addr_mgr_vector; + std::unordered_set var_offset_set; + auto ret = ParseVarAddrMgrMapFromJson(json, var_addr_mgr_vector, var_offset_set); + if (ret != SUCCESS) { + GELOGW("Fail to parse VarAddrMgrMap from Json."); + return false; + } + for (const auto &iter : var_addr_mgr_vector) { + uint8_t *dev_ptr = nullptr; + rtMemType_t memory_type; + ret = VarManager::Instance(session_id_)->GetVarAddr(iter.first, iter.second.tensor_desc, &dev_ptr, memory_type); + if (ret != SUCCESS) { + GELOGW("Fail to find tensor desc of var[%s].", 
iter.first.c_str()); + return false; + } + // Compare memory type and logic address + if (iter.second.memory_type != memory_type || iter.second.address != dev_ptr) { + GELOGW("The VarAddrMgr of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsBroadcastInfoSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare broadcast info between json and VarManager + std::unordered_map var_broadcast_info; + auto ret = ParseBroadcastInfoFromJson(json, var_broadcast_info); + if (ret != SUCCESS) { + GELOGW("Fail to parse BroadcastInfo from Json."); + return false; + } + for (const auto &iter : var_broadcast_info) { + VarBroadCastInfo broadcast_info; + if (VarManager::Instance(session_id_)->GetBroadCastInfo(graph_id_, iter.first, broadcast_info) != SUCCESS) { + GELOGW("Fail to find broadcast info of var[%s].", iter.first.c_str()); + return false; + } + if (iter.second.var_name != broadcast_info.var_name || iter.second.idx != broadcast_info.idx || + iter.second.input_size != broadcast_info.input_size || + iter.second.input_offset != broadcast_info.input_offset || + iter.second.output_size != broadcast_info.output_size || + iter.second.output_offset != broadcast_info.output_offset) { + GELOGW("The BroadcastInfo of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + return true; +} + +bool ModelCacheHelper::IsTransRoadsSameAsCache(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return false; + } + // Compare trans road between json and VarManager + std::unordered_map> trans_roads; + auto ret = ParseTransRoadsFromJson(json, trans_roads); + if (ret != SUCCESS) { + GELOGW("Fail to parse TransRoads from Json."); + return false; + } + for (const auto &iter : trans_roads) { + VarTransRoad *trans_road; + trans_road = VarManager::Instance(session_id_)->GetTransRoad(iter.first); + if (trans_road == nullptr) { + GELOGW("Fail to find trans road of var[%s].", iter.first.c_str()); + return false; + } + if (trans_road->size() != iter.second.size()) { + GELOGW("The TransRoad of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + // Compare every trans node in trans road. 
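+    // A cached road matches only when every entry agrees on the node type and
+    // on the input/output tensor descriptions.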
+ for (size_t idx = 0; idx < trans_road->size(); idx += 1) { + if (!(trans_road->at(idx).node_type == iter.second.at(idx).node_type && + trans_road->at(idx).input == iter.second.at(idx).input && + trans_road->at(idx).output == iter.second.at(idx).output)) { + GELOGW("The TransRoad of variable[%s] in cache is different from VarManager.", iter.first.c_str()); + return false; + } + } + } + return true; +} + +bool ModelCacheHelper::IsVarManagerParamSameAsCache(Json &json) const { + if (!json.is_object()) { + GELOGW("Input param json type should be object."); + return false; + } + try { + if (json[kSessionId].get() != session_id_) { + GELOGW("Check VarManager cache failed.[sessionId]"); + return false; + } + if (json[kDeviceId].get() != VarManager::Instance(session_id_)->DeviceId()) { + GELOGW("Check VarManager cache failed.[deviceId]"); + return false; + } + if (json[kJobId].get() != VarManager::Instance(session_id_)->JobId()) { + GELOGW("Check VarManager cache failed.[jobId]"); + return false; + } + if (json[kGraphMemMaxSize].get() != VarManager::Instance(session_id_)->GetGraphMemoryMaxSize()) { + GELOGW("Check VarManager cache failed.[graphMemMaxSize]"); + return false; + } + if (json[kVarMemMaxSize].get() != VarManager::Instance(session_id_)->GetVarMemMaxSize()) { + GELOGW("Check VarManager cache failed.[varMemMaxSize]"); + return false; + } + if (json[kVarMemLogicBase].get() != VarManager::Instance(session_id_)->GetVarMemLogicBase()) { + GELOGW("Check VarManager cache failed.[varMemLogicBase]"); + return false; + } + if (json[kUseMaxMemSize].get() != VarManager::Instance(session_id_)->GetUseMaxMemorySize()) { + GELOGW("Check VarManager cache failed.[useMaxMemSize]"); + return false; + } + } catch (const std::exception &e) { + GELOGW("Fail to check VarManager json. Error message: %s", e.what()); + return false; + } + return true; +} + +bool ModelCacheHelper::IsVarManagerSameAsCache(Json &json) const { + if (!json.is_object()) { + GELOGW("Input param json type should be object."); + return false; + } + try { + if (!IsVarManagerParamSameAsCache(json)) { + GELOGW("Check VarManager cache failed.[Param]"); + return false; + } + Json mem_resource_json = move(json[kMemResourceMap]); + auto ret = IsMemResourceSameAsCache(mem_resource_json); + if (!ret) { + GELOGW("Check VarManager cache failed.[MemResource]"); + return false; + } + Json var_resource_json = move(json[kVarResource]); + ret = IsAllocatedGraphIdSameAsCache(var_resource_json[kAllocatedGraphId]); + if (!ret) { + GELOGW("Check VarManager cache failed.[AllocatedGraphId]"); + return false; + } + ret = IsChangedGraphIdSameAsCache(var_resource_json[kChangedGraphId]); + if (!ret) { + GELOGW("Check VarManager cache failed.[ChangedGraphId]"); + return false; + } + ret = IsBroadcastInfoSameAsCache(var_resource_json[kVarBroadcastInfo]); + if (!ret) { + GELOGW("Check VarManager cache failed.[VarBroadcastInfo]"); + return false; + } + ret = IsCurVarTensorDescSameAsCache(var_resource_json[kCurVarTensorDescMap]); + if (!ret) { + GELOGW("Check VarManager cache failed.[CurVarTensorDesc]"); + return false; + } + ret = IsVarAddrMgrMapSameAsCache(var_resource_json[kVarAddrMgrMap]); + if (!ret) { + GELOGW("Check VarManager cache failed.[VarAddrMgrMap]"); + return false; + } + ret = IsTransRoadsSameAsCache(var_resource_json[kTransRoads]); + if (!ret) { + GELOGW("Check VarManager cache failed.[TransRoads]"); + return false; + } + } catch (const std::exception &e) { + GELOGW("Fail to check VarManager json. 
Error message: %s", e.what());
+    return false;
+  }
+  return true;
+}
+
+Status ModelCacheHelper::RecoverMemResource(const Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  std::map<rtMemType_t, int64_t> var_mem_size;
+  auto ret = ParseMemResourceFromJson(json, var_mem_size);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse MemResource from Json.");
+    return ret;
+  }
+  for (const auto &iter : var_mem_size) {
+    ret = VarManager::Instance(session_id_)->UpdateVarMemSize(iter.first, iter.second);
+    if (ret != SUCCESS) {
+      GELOGW("Fail to recover var mem size.");
+      return ret;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::RecoverAllocatedGraphId(const Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  std::unordered_map<std::string, uint32_t> allocated_graph_id;
+  auto ret = ParseAllocatedGraphIdFromJson(json, allocated_graph_id);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse AllocatedGraphId from Json.");
+    return ret;
+  }
+  for (const auto &iter : allocated_graph_id) {
+    ret = VarManager::Instance(session_id_)->SetAllocatedGraphId(iter.first, iter.second);
+    if (ret != SUCCESS) {
+      GELOGW("Fail to recover allocated graph id.");
+      return ret;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::RecoverChangedGraphId(const Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  std::unordered_map<std::string, uint32_t> changed_graph_id;
+  auto ret = ParseChangedGraphIdFromJson(json, changed_graph_id);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse ChangedGraphId from Json.");
+    return ret;
+  }
+  for (const auto &iter : changed_graph_id) {
+    ret = VarManager::Instance(session_id_)->SetChangedGraphId(iter.first, iter.second);
+    if (ret != SUCCESS) {
+      GELOGW("Fail to recover changed graph id.");
+      return ret;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::RecoverVarAddrAndTensorDesc(const Json &json) const {
+  if (!(json.is_null() || json.is_array())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  std::vector<std::pair<std::string, VarAddrMgr>> var_addr_mgr_vector;
+  std::unordered_set<uint64_t> var_offset_set;
+  auto ret = ParseVarAddrMgrMapFromJson(json, var_addr_mgr_vector, var_offset_set);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to parse VarAddrMgrMap from Json.");
+    return ret;
+  }
+  for (const auto &iter : var_addr_mgr_vector) {
+    const VarAddrMgr &tensor_addr_mgr = iter.second;
+    const bool var_exist = VarManager::Instance(session_id_)->IsVarExist(iter.first, tensor_addr_mgr.tensor_desc);
+    // SaveVarAddr if var does not exist; the logic address will be recorded by VarManager
+    if (!var_exist) {
+      auto logic_address = reinterpret_cast<uint64_t>(tensor_addr_mgr.address);
+      auto offset = (tensor_addr_mgr.offset);
+      // Check logic address and offset
+      if (logic_address - offset != VarManager::Instance(session_id_)->GetVarMemLogicBase()) {
+        GELOGW("Check logic_address[%lu] and offset[%lu] of %s failed, var mem logic base is %lu, abandon",
+               logic_address, offset, iter.first.c_str(), VarManager::Instance(session_id_)->GetVarMemLogicBase());
+        return PARAM_INVALID;
+      }
+      // SaveVarAddr expects the offset rather than the logic address
+      ret = VarManager::Instance(session_id_)
+              ->SaveVarAddr(iter.first, tensor_addr_mgr.tensor_desc, reinterpret_cast<uint8_t *>(offset),
+                            tensor_addr_mgr.memory_type);
+      if (ret != SUCCESS) {
+        GELOGW("Fail to recover VarAddr
or TensorDesc of var[%s].", iter.first.c_str()); + return ret; + } + } + // SetVarAddr to update cur_var_tensor_desc_map_ + ret = VarManager::Instance(session_id_) + ->SetVarAddr(iter.first, tensor_addr_mgr.tensor_desc, tensor_addr_mgr.address, tensor_addr_mgr.memory_type); + if (ret != SUCCESS) { + GELOGW("Fail to recover VarAddr or TensorDesc desc of var[%s].", iter.first.c_str()); + return ret; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::RecoverBroadcastInfo(const Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + std::unordered_map var_broadcast_info; + auto ret = ParseBroadcastInfoFromJson(json, var_broadcast_info); + if (ret != SUCCESS) { + GELOGW("Fail to parse BroadcastInfo from Json."); + return ret; + } + for (const auto &iter : var_broadcast_info) { + VarBroadCastInfo broadcast_info; + ret = VarManager::Instance(session_id_)->SaveBroadCastInfo(graph_id_, iter.second); + if (ret != SUCCESS) { + GELOGW("Fail to recover broadcast info of var[%s].", iter.first.c_str()); + return ret; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::RecoverTransRoads(const Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + std::unordered_map> trans_roads; + auto ret = ParseTransRoadsFromJson(json, trans_roads); + if (ret != SUCCESS) { + GELOGW("Fail to parse TransRoads from Json."); + return ret; + } + for (const auto &iter : trans_roads) { + ret = VarManager::Instance(session_id_)->SetTransRoad(iter.first, iter.second); + if (ret != SUCCESS) { + GELOGW("Fail to find trans road of var[%s].", iter.first.c_str()); + return ret; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::TensorDescToJson(const GeTensorDesc &ge_tensor_desc, Json &json) { + if (!(json.is_null() || json.is_object())) { + GELOGW("Input param json type should be null or object."); + return PARAM_INVALID; + } + try { + json[kDataType] = static_cast(ge_tensor_desc.GetDataType()); + json[kOriginDataType] = static_cast(ge_tensor_desc.GetOriginDataType()); + json[kLayout] = static_cast(ge_tensor_desc.GetFormat()); + json[kOriginLayout] = static_cast(ge_tensor_desc.GetOriginFormat()); + json[kShape] = ge_tensor_desc.GetShape().GetDims(); + json[kOriginShape] = ge_tensor_desc.GetOriginShape().GetDims(); + uint32_t real_dim_cnt = 0; + (void)TensorUtils::GetRealDimCnt(ge_tensor_desc, real_dim_cnt); // [No need to check value] + json[kRealDimCnt] = real_dim_cnt; + } catch (const std::exception &e) { + GELOGW("Fail to trans GeTensorDesc to json. 
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::JsonToTensorDesc(const Json &json, ge::GeTensorDesc &ge_tensor_desc) { + if (!json.is_object()) { + GELOGW("Input param json type should be object."); + return PARAM_INVALID; + } + try { + ge_tensor_desc.SetDataType(static_cast(json[kDataType].get())); + ge_tensor_desc.SetOriginDataType(static_cast(json[kOriginDataType].get())); + ge_tensor_desc.SetFormat(static_cast(json[kLayout].get())); + ge_tensor_desc.SetOriginFormat(static_cast(json[kOriginLayout].get())); + GeShape shape(json[kShape].get>()); + ge_tensor_desc.SetShape(shape); + GeShape origin_shape(json[kOriginShape].get>()); + ge_tensor_desc.SetOriginShape(origin_shape); + auto real_dim_cnt = json[kRealDimCnt].get(); + (void)TensorUtils::SetRealDimCnt(ge_tensor_desc, real_dim_cnt); // [No need to check value] + } catch (const std::exception &e) { + GELOGW("Fail to trans Json to GeTensorDesc. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetNodesHashMapJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + map hash_map; + GetNodesHash(hash_map); + for (const auto &iter : hash_map) { + Json node_hash_json; + try { + node_hash_json[kName] = iter.first; + node_hash_json[kHash] = iter.second; + json.emplace_back(move(node_hash_json)); + } catch (const std::exception &e) { + GELOGW("Fail to trans node cache to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::GetMemResourceMap(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + const auto total_size = VarManager::Instance(session_id_)->GetVarMemMaxSize(); + const auto var_mem_size = VarManager::Instance(session_id_)->GetVarMemSize(RT_MEMORY_HBM); + Json mem_resource_json; + try { + mem_resource_json[kMemType] = RT_MEMORY_HBM; + mem_resource_json[kTotalSize] = total_size; + mem_resource_json[kVarMemSize] = var_mem_size; + json.emplace_back(move(mem_resource_json)); + } catch (const std::exception &e) { + GELOGW("Fail to trans MemResourceMap to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetVarAddrMgrMapJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + std::unordered_map var_addr_mgr_map; + VarManager::Instance(session_id_)->GetAllVarAddrMgr(var_addr_mgr_map); + try { + for (const auto &iter : var_addr_mgr_map) { + Json var_addr_json; + string name; + GetVarNameFromVarKey(iter.first, iter.second.tensor_desc, name); + var_addr_json[kName] = name; + var_addr_json[kAddress] = reinterpret_cast(iter.second.address); + var_addr_json[kMemoryType] = iter.second.memory_type; + var_addr_json[kOffset] = iter.second.offset; + + // Copy tensor desc to json. + Json tensor_desc_json; + auto ret = TensorDescToJson(iter.second.tensor_desc, tensor_desc_json); + if (ret != SUCCESS) { + GELOGW("Fail to trans tensor desc to json."); + return INTERNAL_ERROR; + } + var_addr_json[kTensorDesc] = move(tensor_desc_json); + + json.emplace_back(move(var_addr_json)); + } + } catch (const std::exception &e) { + GELOGW("Fail to trans VarAddrMgrMap to json. 
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetCurVarTensorDescMapJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + try { + for (const auto &name : var_names_) { + Json cur_tensor_desc_json; + GeTensorDesc tensor_desc; + auto ret = VarManager::Instance(session_id_)->GetCurVarDesc(name, tensor_desc); + if (ret != SUCCESS) { + GELOGI("Get variable[%s] current tensor desc failed. It will be skipped.", name.c_str()); + continue; + } + cur_tensor_desc_json[kName] = name; + + Json tensor_desc_json; + ret = TensorDescToJson(tensor_desc, tensor_desc_json); + if (ret != SUCCESS) { + GELOGW("Fail to trans tensor desc to json."); + return INTERNAL_ERROR; + } + cur_tensor_desc_json[kTensorDesc] = move(tensor_desc_json); + json.emplace_back(move(cur_tensor_desc_json)); + } + } catch (const std::exception &e) { + GELOGW("Fail to trans CurVarTensorDescMap to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetTransRoadsJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + try { + for (const auto &name : var_names_) { + auto trans_road = VarManager::Instance(session_id_)->GetTransRoad(name); + if (trans_road == nullptr) { + continue; + } + // Json object, variable name and trans road + Json trans_road_map_json; + trans_road_map_json[kName] = name; + + Json trans_road_json; + Status ret; + // Add nodes' info to json + for (const auto &trans_node_info : *trans_road) { + Json trans_node_info_json; + trans_node_info_json[kNodeType] = trans_node_info.node_type; + Json input_tensor_desc_json; + ret = TensorDescToJson(trans_node_info.input, input_tensor_desc_json); + if (ret != SUCCESS) { + GELOGW("Fail to trans tensor desc to json."); + return INTERNAL_ERROR; + } + trans_node_info_json[kInputTensorDesc] = move(input_tensor_desc_json); + Json output_tensor_desc_json; + ret = TensorDescToJson(trans_node_info.output, output_tensor_desc_json); + if (ret != SUCCESS) { + GELOGW("Fail to trans tensor desc to json."); + return INTERNAL_ERROR; + } + trans_node_info_json[kOutputTensorDesc] = move(output_tensor_desc_json); + trans_road_json.emplace_back(move(trans_node_info_json)); + } + trans_road_map_json[kTransRoad] = move(trans_road_json); + json.emplace_back(move(trans_road_map_json)); + } + } catch (const std::exception &e) { + GELOGW("Fail to trans VarToTransRoad to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetChangedGraphIdJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + for (const auto &name : var_names_) { + uint32_t changed_graph_id = 0; + Status ret = VarManager::Instance(session_id_)->GetChangedGraphId(name, changed_graph_id); + if (ret != SUCCESS) { + continue; + } + Json name_and_changed_graph_id; + try { + name_and_changed_graph_id[kName] = name; + name_and_changed_graph_id[kGraphId] = changed_graph_id; + json.emplace_back(move(name_and_changed_graph_id)); + } catch (const std::exception &e) { + GELOGW("Fail to trans ChangedGraphId to json. 
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::GetAllocatedGraphIdJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + for (const auto &name : var_names_) { + uint32_t allocated_graph_id = 0; + Status ret = VarManager::Instance(session_id_)->GetAllocatedGraphId(name, allocated_graph_id); + if (ret != SUCCESS) { + continue; + } + Json name_and_allocated_graph_id; + try { + name_and_allocated_graph_id[kName] = name; + name_and_allocated_graph_id[kGraphId] = allocated_graph_id; + json.emplace_back(move(name_and_allocated_graph_id)); + } catch (const std::exception &e) { + GELOGW("Fail to trans AllocatedGraphId to json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::GetBroadcastInfoJson(Json &json) const { + if (!(json.is_null() || json.is_array())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + for (const auto &name : var_names_) { + VarBroadCastInfo var_broadcast_info; + Status ret = VarManager::Instance(session_id_)->GetBroadCastInfo(graph_id_, name, var_broadcast_info); + if (ret != SUCCESS) { + continue; + } + Json var_broadcast_info_json; + try { + var_broadcast_info_json[kName] = name; + var_broadcast_info_json[kBroadcastName] = var_broadcast_info.broadcast_name; + var_broadcast_info_json[kIdx] = var_broadcast_info.idx; + var_broadcast_info_json[kInputOffset] = var_broadcast_info.input_offset; + var_broadcast_info_json[kInputSize] = var_broadcast_info.input_size; + var_broadcast_info_json[kOutputOffset] = var_broadcast_info.output_offset; + var_broadcast_info_json[kOutputSize] = var_broadcast_info.output_size; + json.emplace_back(move(var_broadcast_info_json)); + } catch (const std::exception &e) { + GELOGW("Fail to trans VarBroadcastInfo to json. 
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::GetVarResourceJson(Json &json) const { + if (!(json.is_null() || json.is_object())) { + GELOGW("Input param json type should be null or object."); + return PARAM_INVALID; + } + Json var_addr_mgr_map_json; + Status ret = GetVarAddrMgrMapJson(var_addr_mgr_map_json); + if (ret != SUCCESS) { + GELOGW("GetVarAddrMgrMapJson failed."); + return INTERNAL_ERROR; + } + + Json cur_var_tensor_desc_map_json; + ret = GetCurVarTensorDescMapJson(cur_var_tensor_desc_map_json); + if (ret != SUCCESS) { + GELOGW("GetCurVarTensorDescMapJson failed."); + return INTERNAL_ERROR; + } + + Json trans_roads_json; + ret = GetTransRoadsJson(trans_roads_json); + if (ret != SUCCESS) { + GELOGW("GetTransRoadsJson failed."); + return INTERNAL_ERROR; + } + + Json changed_graph_id_json; + ret = GetChangedGraphIdJson(changed_graph_id_json); + if (ret != SUCCESS) { + GELOGW("GetChangedGraphIdJson failed."); + return INTERNAL_ERROR; + } + + Json allocated_graph_id_json; + ret = GetAllocatedGraphIdJson(allocated_graph_id_json); + if (ret != SUCCESS) { + GELOGW("GetAllocatedGraphIdJson failed."); + return INTERNAL_ERROR; + } + + Json var_broadcast_info_json; + ret = GetBroadcastInfoJson(var_broadcast_info_json); + if (ret != SUCCESS) { + GELOGW("GetBroadcastInfoJson failed."); + return INTERNAL_ERROR; + } + + try { + json[kVarAddrMgrMap] = move(var_addr_mgr_map_json); + json[kCurVarTensorDescMap] = move(cur_var_tensor_desc_map_json); + json[kTransRoads] = move(trans_roads_json); + json[kChangedGraphId] = move(changed_graph_id_json); + json[kAllocatedGraphId] = move(allocated_graph_id_json); + json[kVarBroadcastInfo] = move(var_broadcast_info_json); + } catch (const exception &e) { + GELOGW("Fail to generate VarResource json. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::GetVarManagerJson(Json &json) const { + if (!(json.is_null() || json.is_object())) { + GELOGW("Input param json type should be null or object."); + return PARAM_INVALID; + } + + Json mem_resource_map_json; + auto ret = GetMemResourceMap(mem_resource_map_json); + if (ret != SUCCESS) { + GELOGW("GetMemResourceMap failed."); + return INTERNAL_ERROR; + } + + Json var_resource_json; + ret = GetVarResourceJson(var_resource_json); + if (ret != SUCCESS) { + GELOGW("GetVarResourceJson failed."); + return INTERNAL_ERROR; + } + + try { + json[kSessionId] = session_id_; + json[kDeviceId] = VarManager::Instance(session_id_)->DeviceId(); + json[kJobId] = VarManager::Instance(session_id_)->JobId(); + json[kGraphMemMaxSize] = VarManager::Instance(session_id_)->GetGraphMemoryMaxSize(); + json[kVarMemMaxSize] = VarManager::Instance(session_id_)->GetVarMemMaxSize(); + json[kVarMemLogicBase] = VarManager::Instance(session_id_)->GetVarMemLogicBase(); + json[kUseMaxMemSize] = VarManager::Instance(session_id_)->GetUseMaxMemorySize(); + json[kMemResourceMap] = move(mem_resource_map_json); + json[kVarResource] = move(var_resource_json); + } catch (const exception &e) { + GELOGW("Fail to generate VarManager json. 
Error message: %s", e.what());
+    return INTERNAL_ERROR;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::SaveVarManagerToCache(bool before_build) const {
+  if (!is_cache_path_valid_for_output) {
+    GELOGW("Invalid cache path.");
+    return FAILED;
+  }
+  Json var_manager_json;
+  auto ret = GetVarManagerJson(var_manager_json);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to generate VarManager json.");
+    return FAILED;
+  }
+  string var_manager_path = to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) +
+                            (before_build ? kBeforeVarManagerSuffix : kAfterVarManagerSuffix);
+  ret = SaveJsonToFile(var_manager_path, var_manager_json);
+  if (ret != SUCCESS) {
+    GELOGW("Fail to save VarManager info to json file, path: %s.", cache_path_.c_str());
+    return ret;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::SaveOmModelToCache(const GeModelPtr &ge_model) const {
+  if (!is_cache_path_valid_for_output) {
+    GELOGW("Invalid cache path.");
+    return FAILED;
+  }
+  string om_path = RealPath(cache_path_.c_str());
+  if (om_path.empty()) {
+    GELOGW("File path is invalid, please check om cache path: %s", cache_path_.c_str());
+    return FAILED;
+  }
+  string cache_om_path = cache_path_;
+  cache_om_path += (to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kOmSuffix);
+  GELOGI("SaveOmModelToCache: start to save om model : %s", cache_om_path.c_str());
+  ModelHelper model_helper;
+  SaveParam save_param;
+  ModelBufferData model;
+  Status ret = model_helper.SaveToOmModel(ge_model, save_param, cache_om_path, model);
+  if (ret != SUCCESS) {
+    GELOGW("SaveOmModelToCache: save model failed. ret = %u", ret);
+    return ret;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::ParseMemResourceFromJson(const Json &json, map<rtMemType_t, int64_t> &mem_resource) {
+  if (!(json.is_array() || json.is_null())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  mem_resource.clear();
+  for (const Json &mem_resource_json : json) {
+    try {
+      rtMemType_t mem_type = mem_resource_json[kMemType].get<rtMemType_t>();
+      uint64_t var_mem_size = mem_resource_json[kVarMemSize].get<uint64_t>();
+      mem_resource[mem_type] = var_mem_size;
+    } catch (const exception &e) {
+      GELOGW("Fail to trans Json to MemResource. Error message: %s", e.what());
+      return INTERNAL_ERROR;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::ParseVarAddrMgrMapFromJson(
+    const Json &json, std::vector<std::pair<std::string, VarAddrMgr>> &var_addr_mgr_vector,
+    std::unordered_set<uint64_t> &var_offset_set) {
+  if (!(json.is_array() || json.is_null())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  var_addr_mgr_vector.clear();
+  var_offset_set.clear();
+  for (const Json &var_addr_json : json) {
+    VarAddrMgr var_addr_mgr;
+    try {
+      auto logic_address = var_addr_json[kAddress].get<uint64_t>();
+      auto address = reinterpret_cast<uint8_t *>(logic_address);
+      var_addr_mgr.address = address;
+      var_addr_mgr.offset = var_addr_json[kOffset].get<uint64_t>();
+      var_addr_mgr.memory_type = var_addr_json[kMemoryType].get<rtMemType_t>();
+      auto ret = JsonToTensorDesc(var_addr_json[kTensorDesc], var_addr_mgr.tensor_desc);
+      if (ret != SUCCESS) {
+        GELOGW("Fail to trans json to tensor desc.");
+        return ret;
+      }
+      var_addr_mgr_vector.emplace_back(var_addr_json[kName].get<std::string>(), move(var_addr_mgr));
+      var_offset_set.insert(logic_address);
+    } catch (const exception &e) {
+      GELOGW("Fail to trans Json to VarAddrMgr.
Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::ParseCurVarTensorDescMapFromJson( + const Json &json, std::unordered_map &cur_var_tensor_desc_map) { + if (!(json.is_array() || json.is_null())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + cur_var_tensor_desc_map.clear(); + for (const Json &tensor_desc_json : json) { + GeTensorDesc tensor_desc; + try { + auto ret = JsonToTensorDesc(tensor_desc_json[kTensorDesc], tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to trans json to tensor desc."); + return ret; + } + cur_var_tensor_desc_map[tensor_desc_json[kName].get()] = move(tensor_desc); + } catch (const exception &e) { + GELOGW("Fail to trans Json to VarAddrMgr. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::ParseTransRoadsFromJson( + const Json &json, std::unordered_map> &trans_roads) { + if (!(json.is_array() || json.is_null())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + trans_roads.clear(); + try { + for (const Json &name_trans_road_json : json) { + const Json &trans_road_json = name_trans_road_json[kTransRoad]; + if (!(trans_road_json.is_array() || trans_road_json.is_null())) { + GELOGW("%s json type should be null or object.", kTransRoad); + return PARAM_INVALID; + } + vector trans_road; + for (const Json &trans_node_json : trans_road_json) { + TransNodeInfo trans_node_info; + trans_node_info.node_type = trans_node_json[kNodeType]; + GeTensorDesc input_tensor_desc; + auto ret = JsonToTensorDesc(trans_node_json[kInputTensorDesc], input_tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to trans json to tensor desc."); + return ret; + } + trans_node_info.input = move(input_tensor_desc); + GeTensorDesc output_tensor_desc; + ret = JsonToTensorDesc(trans_node_json[kOutputTensorDesc], output_tensor_desc); + if (ret != SUCCESS) { + GELOGW("Fail to trans json to tensor desc."); + return ret; + } + trans_node_info.output = move(output_tensor_desc); + trans_road.emplace_back(move(trans_node_info)); + } + trans_roads[name_trans_road_json[kName].get()] = move(trans_road); + } + } catch (const exception &e) { + GELOGW("Fail to trans Json to TransRoads. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + return SUCCESS; +} + +Status ModelCacheHelper::ParseChangedGraphIdFromJson(const Json &json, + std::unordered_map &changed_graph_id) { + if (!(json.is_array() || json.is_null())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + changed_graph_id.clear(); + for (const Json &name_graph_id_json : json) { + try { + changed_graph_id[name_graph_id_json[kName].get()] = name_graph_id_json[kGraphId].get(); + } catch (const exception &e) { + GELOGW("Fail to trans Json to changed graph id. Error message: %s", e.what()); + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status ModelCacheHelper::ParseAllocatedGraphIdFromJson(const Json &json, + std::unordered_map &allocated_graph_id) { + if (!(json.is_array() || json.is_null())) { + GELOGW("Input param json type should be null or array."); + return PARAM_INVALID; + } + allocated_graph_id.clear(); + for (const Json &name_graph_id_json : json) { + try { + allocated_graph_id[name_graph_id_json[kName].get()] = name_graph_id_json[kGraphId].get(); + } catch (const exception &e) { + GELOGW("Fail to trans Json to allocated graph id. 
Error message: %s", e.what());
+      return INTERNAL_ERROR;
+    }
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::ParseBroadcastInfoFromJson(
+    const Json &json, std::unordered_map<std::string, VarBroadCastInfo> &var_broadcast_info) {
+  if (!(json.is_array() || json.is_null())) {
+    GELOGW("Input param json type should be null or array.");
+    return PARAM_INVALID;
+  }
+  for (const Json &broadcast_info_json : json) {
+    VarBroadCastInfo broadcast_info;
+    try {
+      broadcast_info.var_name = broadcast_info_json[kName].get<std::string>();
+      broadcast_info.broadcast_name = broadcast_info_json[kBroadcastName].get<std::string>();
+      broadcast_info.idx = broadcast_info_json[kIdx].get<int32_t>();
+      broadcast_info.input_offset = broadcast_info_json[kInputOffset].get<int64_t>();
+      broadcast_info.input_size = broadcast_info_json[kInputSize].get<uint64_t>();
+      broadcast_info.output_offset = broadcast_info_json[kOutputOffset].get<int64_t>();
+      broadcast_info.output_size = broadcast_info_json[kOutputSize].get<uint64_t>();
+    } catch (const exception &e) {
+      GELOGW("Fail to trans Json to VarBroadCastInfo. Error message: %s", e.what());
+      return INTERNAL_ERROR;
+    }
+    var_broadcast_info[broadcast_info.var_name] = broadcast_info;
+  }
+  return SUCCESS;
+}
+
+Status ModelCacheHelper::LoadOmModelFromCache(GeModelPtr &ge_model) const {
+  string cache_om = cache_path_ + to_string(graph_id_) + to_string(graph_id_run_times_[graph_id_]) + kOmSuffix;
+  if (!CheckInputPathValid(cache_om)) {
+    GELOGW("Invalid cache path for input: %s.", cache_om.c_str());
+    return FAILED;
+  }
+  string om_path = RealPath(cache_om.c_str());
+  if (om_path.empty()) {
+    GELOGW("File path is invalid, please check om file: %s", om_path.c_str());
+    return FAILED;
+  }
+  GELOGI("Load model data from file: %s", om_path.c_str());
+  Status ret;
+  string key_path;
+  int32_t priority = 0;
+  ModelData model_data;
+  ret = DavinciModelParser::LoadFromFile(om_path.c_str(), key_path.c_str(), priority, model_data);
+  if (ret != SUCCESS) {
+    GELOGW("LoadOmModelFromCache: Load model from file failed. ret = %u", ret);
+    return ret;
+  }
+
+  ModelHelper model_helper;
+  ret = model_helper.LoadModel(model_data);
+  if (ret != SUCCESS) {
+    GELOGW("LoadOmModelFromCache: Load model from data failed.
ret = %u", ret); + return ret; + } + ge_model = model_helper.GetGeModel(); + // Load TbeKernelBin to op desc from TBEKernelStore + const TBEKernelStore &tbekernel_store = ge_model->GetTBEKernelStore(); + const ComputeGraphPtr compute_graph_in_model = GraphUtils::GetComputeGraph(ge_model->GetGraph()); + for (const auto &node : compute_graph_in_model->GetDirectNode()) { + auto op_desc = node->GetOpDesc(); + tbekernel_store.LoadTBEKernelBinToOpDesc(op_desc); + GELOGI("LoadOmModelFromCache: Load tbe kernel bin to op desc."); + } + return SUCCESS; +} + +Status ModelCacheHelper::GetVarNameFromVarKey(const string &var_key, const GeTensorDesc &tensor_desc, + string &var_name) { + std::string::size_type underline_idx = var_key.rfind('_'); + if (underline_idx == std::string::npos) { + GELOGW("Invalid var key: underline not found"); + return FAILED; + } + std::string::size_type format_idx = + var_key.rfind(std::to_string(static_cast(tensor_desc.GetFormat())), underline_idx); + if (format_idx == std::string::npos) { + GELOGW("Invalid var key: format not found"); + return FAILED; + } + var_name = var_key.substr(0, format_idx); + return SUCCESS; +} +} // namespace ge diff --git a/src/ge/common/helper/model_cache_helper.h b/src/ge/common/helper/model_cache_helper.h new file mode 100644 index 00000000..91257282 --- /dev/null +++ b/src/ge/common/helper/model_cache_helper.h @@ -0,0 +1,121 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GE_COMMON_HELPER_MODEL_CACHE_HELPER_H_ +#define GE_COMMON_HELPER_MODEL_CACHE_HELPER_H_ + +#include <nlohmann/json.hpp> +#include <set> +#include <string> + +#include "ge/ge_api_error_codes.h" +#include "graph/compute_graph.h" +#include "graph/manager/graph_var_manager.h" +#include "model/ge_model.h" + +namespace ge { +using Json = nlohmann::json; + +struct CacheInfo { + size_t node_num; + size_t edge_num; + size_t graph_hash; + map<std::string, size_t> nodes_hash; + CacheInfo() : node_num(0), edge_num(0), graph_hash(0) {} +}; + +class ModelCacheHelper { + public: + ModelCacheHelper(uint64_t session_id, uint32_t graph_id, ComputeGraphPtr &compute_graph); + + Status SaveCacheInfoToCache() const; + Status SaveVarManagerToCache(bool before_build) const; + Status SaveOmModelToCache(const GeModelPtr &ge_model) const; + bool IsModelCacheHit() const; + Status RecoverVarManagerFromCache() const; + Status LoadOmModelFromCache(GeModelPtr &ge_model) const; + Status RefreshComputeGraph(const ComputeGraphPtr &compute_graph); + Status ClearCache(uint32_t graph_id) const; + + private: + Status GetComputeGraphHash(size_t &hash) const; + Status GetNodesHash(map<std::string, size_t> &hash_map) const; + Status GetCacheInfo(CacheInfo &cache_info) const; + + Status RecoverMemResource(const Json &json) const; + Status RecoverAllocatedGraphId(const Json &json) const; + Status RecoverChangedGraphId(const Json &json) const; + Status RecoverVarAddrAndTensorDesc(const Json &json) const; + Status RecoverBroadcastInfo(const Json &json) const; + Status RecoverTransRoads(const Json &json) const; + static Status RecompileNodes(GeModelPtr &ge_model); + + bool IsNodeHashSameAsCache(const map<std::string, size_t> &hash_map) const; + bool IsMemResourceSameAsCache(Json &json) const; + bool IsChangedGraphIdSameAsCache(Json &json) const; + bool IsAllocatedGraphIdSameAsCache(Json &json) const; + bool IsCurVarTensorDescSameAsCache(Json &json) const; + bool IsVarAddrMgrMapSameAsCache(Json &json) const; + bool IsBroadcastInfoSameAsCache(Json &json) const; + bool IsTransRoadsSameAsCache(Json &json) const; + bool IsVarManagerSameAsCache(Json &json) const; + bool IsVarManagerParamSameAsCache(Json &json) const; + + Status SaveJsonToFile(const string &file_name, const Json &json) const; + Status LoadJsonFromFile(const string &file_name, Json &json) const; + + Status GetNodesHashMapJson(Json &json) const; + Status GetMemResourceMap(Json &json) const; + Status GetVarAddrMgrMapJson(Json &json) const; + Status GetCurVarTensorDescMapJson(Json &json) const; + Status GetTransRoadsJson(Json &json) const; + Status GetChangedGraphIdJson(Json &json) const; + Status GetAllocatedGraphIdJson(Json &json) const; + Status GetBroadcastInfoJson(Json &json) const; + Status GetVarResourceJson(Json &json) const; + Status GetVarManagerJson(Json &json) const; + + static Status TensorDescToJson(const GeTensorDesc &ge_tensor_desc, Json &json); + static Status JsonToTensorDesc(const Json &json, GeTensorDesc &ge_tensor_desc); + static Status ParseMemResourceFromJson(const Json &json, map<rtMemType_t, int64_t> &mem_resource); + static Status ParseVarAddrMgrMapFromJson(const Json &json, + std::vector<std::pair<std::string, VarAddrMgr>> &var_addr_mgr_vector, + std::unordered_set<uint64_t> &var_offset_set); + static Status ParseCurVarTensorDescMapFromJson( + const Json &json, std::unordered_map<std::string, GeTensorDesc> &cur_var_tensor_desc_map); + static Status ParseTransRoadsFromJson(const Json &json, + std::unordered_map<std::string, std::vector<TransNodeInfo>> &trans_roads); + static Status ParseChangedGraphIdFromJson(const Json &json, + std::unordered_map<std::string, uint32_t> &changed_graph_id); + static Status ParseAllocatedGraphIdFromJson(const Json &json, + std::unordered_map<std::string, uint32_t>
&allocated_graph_id); + static Status ParseBroadcastInfoFromJson(const Json &json, + std::unordered_map &var_broadcast_info); + static Status GetVarNameFromVarKey(const string &var_key, const GeTensorDesc &tensor_desc, string &var_name); + + uint64_t session_id_; + uint32_t graph_id_; + string cache_path_; + ComputeGraphPtr compute_graph_; + std::set var_names_; + bool is_cache_path_valid_for_output; + static map graph_id_run_times_; +}; + +using ModelCacheHelperPtr = std::shared_ptr; +} // namespace ge + +#endif // GE_COMMON_HELPER_MODEL_CACHE_HELPER_H_ diff --git a/src/ge/common/types.cc b/src/ge/common/types.cc index da0853b6..e8ae5257 100644 --- a/src/ge/common/types.cc +++ b/src/ge/common/types.cc @@ -385,6 +385,7 @@ REGISTER_OPTYPE_DEFINE(STREAMSWITCH, "StreamSwitch"); REGISTER_OPTYPE_DEFINE(STREAMSWITCHN, "StreamSwitchN"); REGISTER_OPTYPE_DEFINE(STREAMACTIVE, "StreamActive"); REGISTER_OPTYPE_DEFINE(MEMCPYASYNC, "MemcpyAsync"); +REGISTER_OPTYPE_DEFINE(MEMCPYADDRASYNC, "MemcpyAddrAsync"); REGISTER_OPTYPE_DEFINE(STREAMMERGE, "StreamMerge"); REGISTER_OPTYPE_DEFINE(ENDGRAPH, "EndGraph"); REGISTER_OPTYPE_DEFINE(SEND, "Send"); @@ -392,6 +393,7 @@ REGISTER_OPTYPE_DEFINE(RECV, "Recv"); REGISTER_OPTYPE_DEFINE(LABELSET, "LabelSet"); REGISTER_OPTYPE_DEFINE(LABELGOTO, "LabelGoto"); +REGISTER_OPTYPE_DEFINE(LABELGOTOEX, "LabelGotoEx"); REGISTER_OPTYPE_DEFINE(LABELSWITCH, "LabelSwitch"); REGISTER_OPTYPE_DEFINE(LABELSWITCHBYINDEX, "LabelSwitchByIndex"); diff --git a/src/ge/common/util.cc b/src/ge/common/util.cc index 79ead57b..f1a2fe6c 100644 --- a/src/ge/common/util.cc +++ b/src/ge/common/util.cc @@ -196,7 +196,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: GE_CHK_BOOL_EXEC(!directory_path.empty(), return -1, "directory path is empty."); auto dir_path_len = directory_path.length(); if (dir_path_len >= PATH_MAX) { - GELOGE(ge::FAILED, "Directory path is too long."); + GELOGW("Directory path is too long."); return -1; } char tmp_dir_path[PATH_MAX] = {0}; @@ -207,7 +207,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: int32_t ret = mmMkdir(tmp_dir_path, S_IRUSR | S_IWUSR | S_IXUSR); // 700 if (ret != 0) { if (errno != EEXIST) { - GELOGE(ge::FAILED, "Cannot create directory %s. Make sure that the directory exists and writable.", + GELOGW("Cannot create directory %s. Make sure that the directory exists and writable.", directory_path.c_str()); return ret; } @@ -218,8 +218,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std:: int32_t ret = mmMkdir(const_cast(directory_path.c_str()), S_IRUSR | S_IWUSR | S_IXUSR); // 700 if (ret != 0) { if (errno != EEXIST) { - GELOGE(ge::FAILED, "Cannot create directory %s. Make sure that the directory exists and writable.", - directory_path.c_str()); + GELOGW("Cannot create directory %s. 
Make sure that the directory exists and is writable.", directory_path.c_str()); return ret; } } @@ -339,7 +338,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::string RealPath(const char FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInputPathValid(const std::string &file_path) { // The specified path is empty if (file_path.empty()) { - GELOGE(ge::FAILED, "Path is empty."); + GELOGW("Path is empty."); return false; } @@ -358,23 +357,23 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInputPathValid(const std::string real_path = RealPath(file_path.c_str()); // Unable to get absolute path (does not exist or does not have permission to access) if (real_path.empty()) { - GELOGE(ge::FAILED, "Can not get real path for %s, %s", file_path.c_str(), strerror(errno)); + GELOGW("Can not get real path for %s, %s", file_path.c_str(), strerror(errno)); return false; } // The absolute path points to a file that is not readable if (access(real_path.c_str(), R_OK) != 0) { - GELOGE(ge::FAILED, "Can not read file in %s, %s", file_path.c_str(), strerror(errno)); + GELOGW("Can not read file in %s, %s", file_path.c_str(), strerror(errno)); return false; } return true; } -FMK_FUNC_HOST_VISIBILITY bool CheckOutputPathValid(const std::string &file_path) { +FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckOutputPathValid(const std::string &file_path) { // The specified path is empty if (file_path.empty()) { - GELOGE(ge::FAILED, "Path is empty."); + GELOGW("Path is empty."); return false; } @@ -394,8 +393,8 @@ FMK_FUNC_HOST_VISIBILITY bool CheckOutputPathValid(const std::string &file_path) // Can get absolute path (file exists) if (!real_path.empty()) { // File is not readable or writable - if (access(real_path.c_str(), R_OK | W_OK | F_OK) != 0) { - GELOGE(ge::FAILED, "Path[ %s ] exists, but can not be write, %s", file_path.c_str(), strerror(errno)); + if (access(real_path.c_str(), W_OK | F_OK) != 0) { + GELOGW("Path[ %s ] exists, but can not be written, %s", file_path.c_str(), strerror(errno)); return false; } } else { @@ -413,7 +412,7 @@ FMK_FUNC_HOST_VISIBILITY bool CheckOutputPathValid(const std::string &file_path) std::string prefix_path = std::string(file_path).substr(0, static_cast<size_t>(path_split_pos)); // Determine whether the specified path is valid by creating the path if (CreateDirectory(prefix_path) != 0) { - GELOGE(ge::FAILED, "Can not create prefix path for path[ %s ].", file_path.c_str()); + GELOGW("Can not create prefix path for path[ %s ].", file_path.c_str()); return false; } } diff --git a/src/ge/executor/CMakeLists.txt b/src/ge/executor/CMakeLists.txt index 7401b062..2f09b50d 100755 --- a/src/ge/executor/CMakeLists.txt +++ b/src/ge/executor/CMakeLists.txt @@ -47,6 +47,7 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR} "../graph/load/new_model_manager/task_info/kernel_task_info.cc" "../graph/load/new_model_manager/task_info/label_goto_task_info.cc" "../graph/load/new_model_manager/task_info/label_set_task_info.cc" + "../graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc" "../graph/load/new_model_manager/task_info/memcpy_async_task_info.cc" "../graph/load/new_model_manager/task_info/profiler_trace_task_info.cc" "../graph/load/new_model_manager/task_info/stream_active_task_info.cc" diff --git a/src/ge/ge_local_engine/engine/host_cpu_engine.h b/src/ge/ge_local_engine/engine/host_cpu_engine.h index 88985f87..1987138d 100644 --- a/src/ge/ge_local_engine/engine/host_cpu_engine.h +++ b/src/ge/ge_local_engine/engine/host_cpu_engine.h @@ -21,7
+21,7 @@ #include "framework/common/ge_inner_error_codes.h" #include "graph/node.h" #include "graph/operator.h" -#include "register/register.h" +#include "inc/register/register.h" namespace ge { class HostCpuEngine { diff --git a/src/ge/ge_runtime/output.cc b/src/ge/ge_runtime/output.cc index 2f4ade89..5153f688 100644 --- a/src/ge/ge_runtime/output.cc +++ b/src/ge/ge_runtime/output.cc @@ -76,7 +76,7 @@ bool Output::CopyRslt(OutputData *rslt, uint32_t data_begin, uint32_t &data_inde DataBuffer data_buf = rslt->blobs[data_begin + data_count]; bool ret = SetDataBuf(data_buf, data_begin, data_count, i, support_mem_share); if (!ret) { - GELOGE(FAILED, "Copy data to host error. index: %lu", i); + GELOGE(FAILED, "Copy data to host error. index: %lu, addr: %p", i, v_input_data_addr_[i]); return ret; } data_index = data_begin + data_count; diff --git a/src/ge/ge_runtime/runtime_model.cc b/src/ge/ge_runtime/runtime_model.cc index ffb0d8a0..330ffc14 100644 --- a/src/ge/ge_runtime/runtime_model.cc +++ b/src/ge/ge_runtime/runtime_model.cc @@ -96,6 +96,7 @@ bool RuntimeModel::InitStream(std::shared_ptr<DavinciModel> &davinci_model) { GELOGE(RT_FAILED, "Call rt api rtModelBindStream failed, ret: 0x%X", rt_ret); return false; } + GELOGI("stream index:%u, stream:%p.", i, stream); } return true; @@ -446,8 +447,11 @@ bool RuntimeModel::InitConstantInfo(std::shared_ptr<DavinciModel> &davinci_model /// The logic of GetShapeSize is wrong, the scalar tensor's GetShapeSize is zero /// and that of unknown shape is zero too. /// Unknown shape will not appear here, so we can use zero to judge whether a tensor is a scalar or not. - int64_t elem_num = - (constant->weight_tensors[0].GetShapeSize() == 0) ? 1 : constant->weight_tensors[0].GetShapeSize(); + int64_t elem_num = constant->weight_tensors[0].GetShapeSize(); + if (elem_num == 0 && constant->weight_tensors[0].size == 0) { + elem_num = 1; + } + if (constant->weight_data.size() < sizeof(uint64_t)) { GELOGE(FAILED, "weight_data size is smaller than sizeof(uint64_t)"); return false; diff --git a/src/ge/ge_runtime/task/cce_task.cc b/src/ge/ge_runtime/task/cce_task.cc index e5ea99c0..04fd5610 100644 --- a/src/ge/ge_runtime/task/cce_task.cc +++ b/src/ge/ge_runtime/task/cce_task.cc @@ -82,6 +82,7 @@ bool CceTask::Distribute() { stub_func_ = nullptr; return false; } + GELOGI("CCETask: stub_func = %s [%p].", task_info_->stub_func().c_str(), stub_func_); // Flowtable if (is_flowtable_) { diff --git a/src/ge/ge_runtime/task/event_record_task.cc b/src/ge/ge_runtime/task/event_record_task.cc index 46ac7a1b..85ddc053 100644 --- a/src/ge/ge_runtime/task/event_record_task.cc +++ b/src/ge/ge_runtime/task/event_record_task.cc @@ -43,6 +43,8 @@ EventRecordTask::EventRecordTask(const ModelContext &model_context, EventRecordTask::~EventRecordTask() {} bool EventRecordTask::Distribute() { + GELOGI("EventRecordTask Distribute start, stream: %p, event: %p, stream_id: %u, event_id: %u.", stream_, event_, + task_info_->stream_id(), task_info_->event_id()); rtError_t rt_ret = rtEventRecord(event_, stream_); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); diff --git a/src/ge/ge_runtime/task/event_wait_task.cc b/src/ge/ge_runtime/task/event_wait_task.cc index e4cf986f..558c2a59 100644 --- a/src/ge/ge_runtime/task/event_wait_task.cc +++ b/src/ge/ge_runtime/task/event_wait_task.cc @@ -42,6 +42,9 @@ EventWaitTask::EventWaitTask(const ModelContext &model_context, const std::share EventWaitTask::~EventWaitTask() {} bool EventWaitTask::Distribute() { + GELOGI("EventWaitTask Distribute start,
stream: %p, event: %p, stream_id: %u, event_id: %u.", stream_, event_, + task_info_->stream_id(), task_info_->event_id()); + rtError_t rt_ret = rtStreamWaitEvent(stream_, event_); if (rt_ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt api rtStreamWaitEvent failed, ret: 0x%X", rt_ret); diff --git a/src/ge/ge_runtime/task/hccl_task.cc b/src/ge/ge_runtime/task/hccl_task.cc index 0794c0e9..7a513597 100644 --- a/src/ge/ge_runtime/task/hccl_task.cc +++ b/src/ge/ge_runtime/task/hccl_task.cc @@ -101,6 +101,7 @@ bool HcclTask::Distribute() { char *private_def = reinterpret_cast<char *>(const_cast<char *>(task_info_->private_def().data())); auto private_def_len = static_cast<uint32_t>(task_info_->private_def().size()); + GELOGI("the first address of the custom info, privateDef=%p", private_def); GELOGI("hcclStreamNum =%ld", task_info_->hccl_stream_num()); for (int64_t i = 0; i < task_info_->hccl_stream_num(); ++i) { @@ -117,6 +118,7 @@ bool HcclTask::Distribute() { return false; } + GELOGI("hccl_stream addr is=%p", stream); slave_stream_list_.push_back(stream); } diff --git a/src/ge/ge_runtime/task/stream_switch_task.cc b/src/ge/ge_runtime/task/stream_switch_task.cc index afbdba18..2adcb4bd 100644 --- a/src/ge/ge_runtime/task/stream_switch_task.cc +++ b/src/ge/ge_runtime/task/stream_switch_task.cc @@ -62,6 +62,9 @@ bool StreamSwitchTask::Distribute() { rtStream_t true_stream = stream_list_[task_info_->true_stream_id()]; rtSwitchDataType_t data_type = static_cast<rtSwitchDataType_t>(task_info_->data_type()); + GELOGI("InitStreamSwitchTask, cond:%d, trueStream:%p, trueStreamID:%ld, datatype:%ld.", cond, true_stream, + task_info_->true_stream_id(), task_info_->data_type()); + GELOGI("StreamSwitchTask Distribute Start."); rtError_t rt_ret = rtStreamSwitchEx(input, cond, value, true_stream, stream_, data_type); if (rt_ret != RT_ERROR_NONE) { @@ -69,6 +72,7 @@ bool StreamSwitchTask::Distribute() { return false; } + GELOGI("Distribute StreamSwitch, cond:%d, trueStream:%p, datatype:%ld.", cond, true_stream, task_info_->data_type()); return true; } diff --git a/src/ge/ge_runtime/task/tbe_task.cc b/src/ge/ge_runtime/task/tbe_task.cc index 19056c1b..8a3c36a4 100644 --- a/src/ge/ge_runtime/task/tbe_task.cc +++ b/src/ge/ge_runtime/task/tbe_task.cc @@ -69,6 +69,7 @@ bool TbeTask::Distribute() { stub_func_ = nullptr; return false; } + GELOGI("TbeTask: stub_func = %s [%p].", task_info_->stub_func().c_str(), stub_func_); // Get args std::vector<void *> tensor_device_addrs; diff --git a/src/ge/graph/build/graph_builder.cc b/src/ge/graph/build/graph_builder.cc index 9424a4ed..957ddc2d 100644 --- a/src/ge/graph/build/graph_builder.cc +++ b/src/ge/graph/build/graph_builder.cc @@ -18,8 +18,8 @@ #include "common/ge/ge_util.h" #include "common/helper/model_helper.h" #include "common/opskernel/ops_kernel_info_types.h" -#include "graph/build/stream_graph_optimizer.h" #include "graph/build/run_context.h" +#include "graph/build/stream_graph_optimizer.h" #include "graph/manager/graph_var_manager.h" #include "graph/utils/node_utils.h" #include "graph/utils/type_utils.h" @@ -98,8 +98,10 @@ Status GraphBuilder::Build(ComputeGraphPtr &comp_graph, std::vector &subgraph_ptr_list, + ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map, uint64_t session_id) { GE_CHECK_NOTNULL(model_ptr); GE_CHECK_NOTNULL(comp_graph); @@ -190,7 +192,7 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr } StreamGraphOptimizer stream_optimizer; - ret = stream_optimizer.OptimizeStreamedSubGraph(comp_graph, subgraph_ptr_list, run_context.GetRunContext()); + ret =
stream_optimizer.OptimizeStreamedSubGraph(comp_graph, subgraph_map, run_context.GetRunContext()); if (ret != SUCCESS) { GELOGE(ret, "Optimize streamed subGraph fail."); return ret; diff --git a/src/ge/graph/build/graph_builder.h b/src/ge/graph/build/graph_builder.h index c1c4f7b6..d0bf26e6 100644 --- a/src/ge/graph/build/graph_builder.h +++ b/src/ge/graph/build/graph_builder.h @@ -53,7 +53,7 @@ class GraphBuilder { private: Status CalcOpParam(const ge::ComputeGraphPtr &graph); Status GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr &model_ptr, ComputeGraphPtr &comp_graph, - std::vector &subgraph_ptr_list, uint64_t session_id = INVALID_SESSION_ID); + Graph2SubGraphInfoList &subgraph_map, uint64_t session_id = INVALID_SESSION_ID); Status SetInputSize(const ge::NodePtr &node_ptr); Status UpdateDataInputSize(const ge::NodePtr &node_ptr); Status SecondPartition(ge::ComputeGraphPtr &comp_graph, vector &subgraph_ptr_list); diff --git a/src/ge/graph/build/logical_stream_allocator.cc b/src/ge/graph/build/logical_stream_allocator.cc index 2b11347b..16c4935e 100644 --- a/src/ge/graph/build/logical_stream_allocator.cc +++ b/src/ge/graph/build/logical_stream_allocator.cc @@ -70,7 +70,7 @@ bool LogicalStreamPass::HasNonConstInputNode(const Subgraph &subgraph) const { return false; } -Status AssignByLabelPass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { +Status AssignByLabelPass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { bool changed = false; int64_t &next_stream = context.next_stream; map label_streams; @@ -97,7 +97,7 @@ Status AssignByLabelPass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { +Status IndependentStreamPass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { bool changed = false; int64_t &next_stream = context.next_stream; @@ -129,8 +129,7 @@ Status IndependentStreamPass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, - Context &context) { +Status AssignByDependencyPass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { bool changed = false; if (IsHeadNodeExceeded(subgraphs)) { int64_t &next_stream = context.next_stream; @@ -298,7 +297,7 @@ int64_t AssignByDependencyPass::AssignNewStream(SubgraphPtr subgraph) { subgraph->stream_id = stream_id; engine_next_streams_[engine_name] = stream_id + 1; - assigned_subgraphs_.emplace(subgraph); + assigned_subgraphs_.emplace_back(subgraph); if ((stream_id + 1) > engine_stream_num_[engine_name]) { engine_stream_num_[engine_name] = stream_id + 1; @@ -311,6 +310,15 @@ int64_t AssignByDependencyPass::AssignNewStream(SubgraphPtr subgraph) { } void AssignByDependencyPass::UpdateAssignedSubgraphs(Context &context) { + // If the parent stream is valid, the first assigned stream will reuse the parent stream id + // and other streams use new id. To ensure that the id of the new stream is continuous, + // we first subtract one from next_stream. + int64_t to_be_updated_stream = kInvalidStream; + if (context.parent_stream != kInvalidStream) { + context.next_stream--; + to_be_updated_stream = context.next_stream; + } + // Update the starting stream id for each engine. int64_t &next_stream = context.next_stream; map engine_start_streams; @@ -320,10 +328,16 @@ void AssignByDependencyPass::UpdateAssignedSubgraphs(Context &context) { next_stream += stream_count; } - // Update the subgraphs assigned by the engine. + // Update the subgraph streams assigned by engine. 
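+  // Illustrative walk-through (an added example, not part of the original patch): if the first
+  // engine assigned engine-local ids {0} and a second engine {0, 1}, the pre-decremented
+  // next_stream means the first engine's stream 0 rebases exactly onto the reserved id, which
+  // the loop below rewrites to context.parent_stream; every other stream keeps its fresh,
+  // contiguous global id.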
for (auto &subgraph : assigned_subgraphs_) { subgraph->stream_id += engine_start_streams[subgraph->engine_conf.id]; - GELOGI("Stream of subgraph %s has been updated to %ld.", subgraph->name.c_str(), subgraph->stream_id); + if (subgraph->stream_id == to_be_updated_stream) { + subgraph->stream_id = context.parent_stream; + GELOGI("Subgraph %s of engine %s reuses parent stream %ld.", subgraph->name.c_str(), + subgraph->engine_conf.id.c_str(), context.parent_stream); + } else { + GELOGI("Stream of subgraph %s has been updated to %ld.", subgraph->name.c_str(), subgraph->stream_id); + } } } @@ -337,7 +351,7 @@ void AssignByDependencyPass::UpdateReusedSubgraphs() { } } -Status NodeStreamUpdatePass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { +Status NodeStreamUpdatePass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { // Check if all subgraphs have been assigned a stream. for (const SubgraphPtr &subgraph : subgraphs) { const string &engine_name = subgraph->engine_conf.id; @@ -353,7 +367,7 @@ Status NodeStreamUpdatePass::Run(ComputeGraphPtr whole_graph, const vectorGetDirectNode()) { + for (NodePtr &node : graph->GetDirectNode()) { GE_CHECK_NOTNULL(node->GetOpDesc()); node->GetOpDesc()->SetStreamId(kInvalidStream); } @@ -375,76 +389,11 @@ Status NodeStreamUpdatePass::Run(ComputeGraphPtr whole_graph, const vector &subgraphs, Context &context) { - if (!context.hcom_parallel) { - return NOT_CHANGED; - } - - GELOGI("AllReduceParallelPass is enabled."); - GraphUtils::DumpGEGraph(whole_graph, "BeforeAllReduceParallel"); - - // All successors of HcomAllReduce. - set all_reduce_succs; - - for (const NodePtr &node : whole_graph->GetDirectNode()) { - if (node->GetType() != HCOMALLREDUCE || node->GetInDataNodes().size() <= 1) { - continue; - } - - string reduce_stream_label; - GE_CHECK_NOTNULL(node->GetOpDesc()); - // ATTR_NAME_STREAM_LABEL is optional. - (void)AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, reduce_stream_label); - - set cur_nodes = {node}; - while (!cur_nodes.empty()) { - set all_out_data_nodes; - for (auto &curr_node : cur_nodes) { - for (const NodePtr &out_node : curr_node->GetOutDataNodes()) { - string out_stream_label; - GE_CHECK_NOTNULL(out_node->GetOpDesc()); - // ATTR_NAME_STREAM_LABEL is optional. - (void)AttrUtils::GetStr(out_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, out_stream_label); - if (out_stream_label == reduce_stream_label) { - all_reduce_succs.emplace(out_node); - all_out_data_nodes.emplace(out_node); - } - } - } - cur_nodes = all_out_data_nodes; - } - } - - map old_stream_to_new; - for (const NodePtr &node : all_reduce_succs) { - GE_CHECK_NOTNULL(node->GetOpDesc()); - auto old_stream = node->GetOpDesc()->GetStreamId(); - if (old_stream != kInvalidStream) { - int64_t new_stream = kInvalidStream; - auto iter = old_stream_to_new.find(old_stream); - if (iter != old_stream_to_new.end()) { - new_stream = iter->second; - } else { - new_stream = context.next_stream; - context.next_stream++; - old_stream_to_new.emplace(old_stream, new_stream); - } - - GELOGI("Stream of node %s has been updated from %ld to %ld.", node->GetName().c_str(), old_stream, new_stream); - node->GetOpDesc()->SetStreamId(new_stream); - } - } - - return !all_reduce_succs.empty() ? 
SUCCESS : NOT_CHANGED; -} - int64_t NodeStreamUpdatePass::GetSingleInoutStream(const NodePtr &node) const { set stream_ids; @@ -472,11 +421,11 @@ int64_t NodeStreamUpdatePass::GetSingleInoutStream(const NodePtr &node) const { return kInvalidStream; } -Status NodeStreamUpdatePass::UpdateForSkippedEngine(const ComputeGraphPtr &whole_graph, +Status NodeStreamUpdatePass::UpdateForSkippedEngine(const ComputeGraphPtr &graph, const vector &subgraphs) { set nodes_to_be_updated; - // Check if sub graph is engine skipped and without stream label or not + // Check if subgraph is engine skipped and without stream label or not for (const SubgraphPtr &subgraph : subgraphs) { if (IsEngineSkip(*subgraph) && !HasStreamLabel(*subgraph)) { auto graph = subgraph->subgraph_info.GetSubGraph(); @@ -492,7 +441,7 @@ Status NodeStreamUpdatePass::UpdateForSkippedEngine(const ComputeGraphPtr &whole } // Try reassign the stream id - for (ge::NodePtr &node : whole_graph->GetDirectNode()) { + for (ge::NodePtr &node : graph->GetDirectNode()) { auto op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); int64_t stream_id = op_desc->GetStreamId(); @@ -509,6 +458,7 @@ Status NodeStreamUpdatePass::UpdateForSkippedEngine(const ComputeGraphPtr &whole } } } + return SUCCESS; } @@ -525,40 +475,65 @@ bool NodeStreamUpdatePass::AreAllPredStreamsInvalid(const NodePtr &node) const { return true; } -void NodeStreamUpdatePass::RefreshContinuousStreams(ComputeGraphPtr whole_graph, Context &context) const { - int64_t stream_num = context.next_stream; - vector stream_has_node(stream_num); +Status AllReduceParallelPass::Run(ComputeGraphPtr graph, const vector &subgraphs, Context &context) { + if (!context.hcom_parallel) { + return NOT_CHANGED; + } - for (const NodePtr &node : whole_graph->GetDirectNode()) { - if (node != nullptr) { - auto op_desc = node->GetOpDesc(); - if (op_desc != nullptr) { - int64_t stream_id = op_desc->GetStreamId(); - if (stream_id != kInvalidStream && stream_id < stream_num) { - stream_has_node[stream_id] = true; - } - } + GELOGI("AllReduceParallelPass is enabled."); + GraphUtils::DumpGEGraph(graph, "BeforeAllReduceParallel"); + + // All successors of HcomAllReduce. 
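+  // Collected by the breadth-first walk below over out-data edges: starting from each
+  // qualifying HcomAllReduce node, only successors whose (optional) stream label matches the
+  // reduce node's own label are kept, so branches with a different label stay on their streams.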
+ set all_reduce_succs; + + for (const NodePtr &node : graph->GetDirectNode()) { + if (node->GetType() != HCOMALLREDUCE || node->GetInDataNodes().size() <= 1) { + continue; } - } - context.next_stream = 0; - vector old_to_new_streams(stream_num, kInvalidStream); - for (size_t old_stream = 0; old_stream < stream_has_node.size(); ++old_stream) { - if (stream_has_node[old_stream]) { - old_to_new_streams[old_stream] = context.next_stream; - ++context.next_stream; + string reduce_stream_label; + GE_CHECK_NOTNULL(node->GetOpDesc()); + (void)AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, reduce_stream_label); + + set cur_nodes = {node}; + while (!cur_nodes.empty()) { + set all_out_data_nodes; + for (auto &curr_node : cur_nodes) { + for (const NodePtr &out_node : curr_node->GetOutDataNodes()) { + string out_stream_label; + GE_CHECK_NOTNULL(out_node->GetOpDesc()); + (void)AttrUtils::GetStr(out_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, out_stream_label); + if (out_stream_label == reduce_stream_label) { + all_reduce_succs.emplace(out_node); + all_out_data_nodes.emplace(out_node); + } + } + } + cur_nodes = all_out_data_nodes; } } - for (const NodePtr &node : whole_graph->GetDirectNode()) { - auto op_desc = node->GetOpDesc(); - if (op_desc != nullptr) { - int64_t stream_id = op_desc->GetStreamId(); - if (stream_id != kInvalidStream && stream_id < stream_num) { - op_desc->SetStreamId(old_to_new_streams[stream_id]); + map old_stream_to_new; + for (const NodePtr &node : all_reduce_succs) { + GE_CHECK_NOTNULL(node->GetOpDesc()); + auto old_stream = node->GetOpDesc()->GetStreamId(); + if (old_stream != kInvalidStream) { + int64_t new_stream = kInvalidStream; + auto iter = old_stream_to_new.find(old_stream); + if (iter != old_stream_to_new.end()) { + new_stream = iter->second; + } else { + new_stream = context.next_stream; + context.next_stream++; + old_stream_to_new.emplace(old_stream, new_stream); } + + GELOGI("Stream of node %s has been updated from %ld to %ld.", node->GetName().c_str(), old_stream, new_stream); + node->GetOpDesc()->SetStreamId(new_stream); } } + + return !all_reduce_succs.empty() ? 
SUCCESS : NOT_CHANGED; } LogicalStreamAllocator::LogicalStreamAllocator(const map &scheduler_confs, @@ -567,9 +542,10 @@ LogicalStreamAllocator::LogicalStreamAllocator(const map context_.hcom_parallel = hcom_parallel; } -Status LogicalStreamAllocator::Assign(const ComputeGraphPtr &whole_graph, const vector &subgraph_infos, +Status LogicalStreamAllocator::Assign(const ComputeGraphPtr &whole_graph, const Graph2SubGraphInfoList &subgraph_map, int64_t &stream_num) { GE_CHECK_NOTNULL(whole_graph); + map engine_confs; GE_TIMESTAMP_START(InitEngineConfs); for (const auto &item : scheduler_confs_) { @@ -583,16 +559,64 @@ Status LogicalStreamAllocator::Assign(const ComputeGraphPtr &whole_graph, const } GE_TIMESTAMP_END(InitEngineConfs, "GraphBuilder::AssignStreamInitEngineConfs"); + Status status = DoAssign(whole_graph, subgraph_map, engine_confs); + if (status != SUCCESS) { + GELOGE(status, "Assign streams failed."); + return status; + } + + vector subgraphs = whole_graph->GetAllSubgraphs(); + for (const ComputeGraphPtr &subgraph : subgraphs) { + Status status = DoAssign(subgraph, subgraph_map, engine_confs); + if (status != SUCCESS) { + GELOGE(status, "Assign streams failed."); + return status; + } + } + + RefreshContinuousStreams(whole_graph); + + stream_num = context_.next_stream; + GELOGI("Assigned logical stream num: %ld.", stream_num); + + return SUCCESS; +} + +Status LogicalStreamAllocator::DoAssign(const ComputeGraphPtr &graph, const Graph2SubGraphInfoList &subgraph_map, + const map &engine_confs) { + GE_CHECK_NOTNULL(graph); + + NodePtr parent_node = graph->GetParentNode(); + if (parent_node == nullptr || parent_node->GetOpDesc() == nullptr) { + context_.parent_stream = kInvalidStream; + } else { + context_.parent_stream = parent_node->GetOpDesc()->GetStreamId(); + } + + auto iter = subgraph_map.find(graph); + if (iter == subgraph_map.end()) { + GELOGE(FAILED, "Graph %s not found.", graph->GetName().c_str()); + return FAILED; + } + + const vector &subgraph_info_list = iter->second; vector subgraphs; GE_TIMESTAMP_START(ConvertSubgraphs); - Status status = ConvertSubgraphs(subgraph_infos, engine_confs, subgraphs); + Status status = ConvertSubgraphs(subgraph_info_list, engine_confs, subgraphs); GE_TIMESTAMP_END(ConvertSubgraphs, "GraphBuilder::AssignStreamConvertSubgraphs"); if (status != SUCCESS) { GELOGE(status, "Create subgraphs failed."); return status; } - return RunPasses(whole_graph, subgraphs, stream_num); + GELOGI("Subgraphs of graph %s:", graph->GetName().c_str()); + for (const auto &subgraph : subgraphs) { + if (subgraph != nullptr) { + GELOGI("subgraph: %s", subgraph->name.c_str()); + } + } + + return RunPasses(graph, subgraphs); } Status LogicalStreamAllocator::ConvertSubgraphs(const vector &subgraph_infos, @@ -631,8 +655,7 @@ Status LogicalStreamAllocator::ConvertSubgraphs(const vector &s return SUCCESS; } -Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &whole_graph, const vector &subgraphs, - int64_t &stream_num) { +Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &graph, const vector &subgraphs) { vector passes; passes.emplace_back(MakeShared()); passes.emplace_back(MakeShared()); @@ -643,7 +666,7 @@ Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &whole_graph, con for (auto &pass : passes) { GE_CHECK_NOTNULL(pass); - Status status = pass->Run(whole_graph, subgraphs, context_); + Status status = pass->Run(graph, subgraphs, context_); if (status == SUCCESS) { GELOGI("Stream pass %s return SUCCESS.", pass->GetName().c_str()); } else 
if (status == NOT_CHANGED) { @@ -654,9 +677,42 @@ Status LogicalStreamAllocator::RunPasses(const ComputeGraphPtr &whole_graph, con } } - stream_num = context_.next_stream; - GELOGI("Assigned logical stream num: %ld.", stream_num); - return SUCCESS; } + +void LogicalStreamAllocator::RefreshContinuousStreams(const ComputeGraphPtr &graph) { + int64_t stream_num = context_.next_stream; + vector stream_has_node(stream_num); + + for (const NodePtr &node : graph->GetAllNodes()) { + if (node != nullptr) { + auto op_desc = node->GetOpDesc(); + if (op_desc != nullptr) { + int64_t stream_id = op_desc->GetStreamId(); + if (stream_id != kInvalidStream && stream_id < stream_num) { + stream_has_node[stream_id] = true; + } + } + } + } + + context_.next_stream = 0; + vector old_to_new_streams(stream_num, kInvalidStream); + for (size_t old_stream = 0; old_stream < stream_has_node.size(); ++old_stream) { + if (stream_has_node[old_stream]) { + old_to_new_streams[old_stream] = context_.next_stream; + ++context_.next_stream; + } + } + + for (const NodePtr &node : graph->GetAllNodes()) { + auto op_desc = node->GetOpDesc(); + if (op_desc != nullptr) { + int64_t stream_id = op_desc->GetStreamId(); + if (stream_id != kInvalidStream && stream_id < stream_num) { + op_desc->SetStreamId(old_to_new_streams[stream_id]); + } + } + } +} } // namespace ge diff --git a/src/ge/graph/build/logical_stream_allocator.h b/src/ge/graph/build/logical_stream_allocator.h index 2265a0f3..404d22f9 100644 --- a/src/ge/graph/build/logical_stream_allocator.h +++ b/src/ge/graph/build/logical_stream_allocator.h @@ -60,7 +60,7 @@ class LogicalStreamPass { }; struct Context { - // Next stream id. + int64_t parent_stream = kInvalidStream; int64_t next_stream = 0; bool hcom_parallel = false; }; @@ -71,7 +71,7 @@ class LogicalStreamPass { virtual ~LogicalStreamPass() = default; const std::string &GetName() const; - virtual Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) = 0; + virtual Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) = 0; protected: bool IsEngineSkip(const Subgraph &subgraph) const; @@ -93,21 +93,21 @@ using LogicalStreamPassPtr = std::shared_ptr; class AssignByLabelPass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(AssignByLabelPass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; }; // Engines such as hccl require independent Stream. class IndependentStreamPass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(IndependentStreamPass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; }; // Reuse streams or assign new streams based on dependencies. 
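// As implemented in the .cc changes above: a subgraph first tries to inherit a stream from an
// upstream subgraph it depends on (recorded in reused_subgraphs_, see UpdateReusedSubgraphs);
// only when nothing is reusable does AssignNewStream hand out the owning engine's next
// engine-local id, which UpdateAssignedSubgraphs later rebases onto globally unique stream ids.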
class AssignByDependencyPass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(AssignByDependencyPass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; private: void InitEndSubgraphMap(const std::vector &subgraphs, std::map &end_subgraph_map); @@ -132,7 +132,7 @@ class AssignByDependencyPass : public LogicalStreamPass { std::map engine_stream_num_; // Subgraphs of assign stream by engine - std::set assigned_subgraphs_; + std::vector assigned_subgraphs_; // std::vector> reused_subgraphs_; @@ -142,7 +142,7 @@ class AssignByDependencyPass : public LogicalStreamPass { class NodeStreamUpdatePass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(NodeStreamUpdatePass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; private: /// Optimize for case like: @@ -150,19 +150,18 @@ class NodeStreamUpdatePass : public LogicalStreamPass { /// To case: /// NodeA(stream1) -> Const(stream1) -> NodeB(stream1) /// Which could reduce event number (Const could be other type which belong to skipped engine subgraph) - Status UpdateForSkippedEngine(const ComputeGraphPtr &whole_graph, const std::vector &subgraphs); + Status UpdateForSkippedEngine(const ComputeGraphPtr &graph, const std::vector &subgraphs); int64_t GetSingleInoutStream(const NodePtr &node) const; // Judge if all predecessors' streams of node are INVALID_STREAM bool AreAllPredStreamsInvalid(const NodePtr &node) const; - void RefreshContinuousStreams(ComputeGraphPtr whole_graph, Context &context) const; }; // AllReduce and backward operators execute in parallel. class AllReduceParallelPass : public LogicalStreamPass { public: STREAM_PASS_DEFAULT_FUNC(AllReduceParallelPass); - Status Run(ComputeGraphPtr whole_graph, const std::vector &subgraphs, Context &context) override; + Status Run(ComputeGraphPtr graph, const std::vector &subgraphs, Context &context) override; }; // Assign logical streams which is not limited by the number of tasks. 
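The switch from a flat SubGraphInfo vector to Graph2SubGraphInfoList is what makes the per-graph walk in the .cc changes possible: each ComputeGraph, root graph or subgraph, maps to the SubGraphInfo list produced when it was partitioned, and DoAssign looks up its own list before running the passes. A sketch of the assumed shape of that type follows; its definition lives outside this diff, so the exact spelling is an assumption rather than part of the patch:

// Assumed definition; maps every root graph and subgraph to the
// SubGraphInfo list produced for it by graph partitioning.
using Graph2SubGraphInfoList = std::unordered_map<ComputeGraphPtr, std::vector<SubGraphInfoPtr>>;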
@@ -178,13 +177,16 @@ class LogicalStreamAllocator { LogicalStreamAllocator &operator=(const LogicalStreamAllocator &) = delete; ~LogicalStreamAllocator() = default; - Status Assign(const ComputeGraphPtr &whole_graph, const std::vector &subgraphs, int64_t &stream_num); + Status Assign(const ComputeGraphPtr &whole_graph, const Graph2SubGraphInfoList &subgraph_map, int64_t &stream_num); private: + Status DoAssign(const ComputeGraphPtr &graph, const Graph2SubGraphInfoList &subgraph_map, + const map &engine_confs); Status ConvertSubgraphs(const std::vector &subgraph_infos, const std::map &engine_confs, std::vector &subgraphs); - Status RunPasses(const ComputeGraphPtr &whole_graph, const std::vector &subgraphs, int64_t &stream_num); + Status RunPasses(const ComputeGraphPtr &graph, const std::vector &subgraphs); + void RefreshContinuousStreams(const ComputeGraphPtr &graph); const std::map &scheduler_confs_; const std::map &max_parallel_num_; diff --git a/src/ge/graph/build/memory/block_mem_assigner.cc b/src/ge/graph/build/memory/block_mem_assigner.cc index 77860e4d..4f55a569 100644 --- a/src/ge/graph/build/memory/block_mem_assigner.cc +++ b/src/ge/graph/build/memory/block_mem_assigner.cc @@ -805,6 +805,9 @@ void SetOffsetSize(const NodeTypeIndex &node_type_index, int64_t offset, size_t } } op_desc->SetOutputOffset(output_list); + GELOGI("[IMAS]Set %s name[%s] output[%d] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu].", + graph_name.c_str(), op_desc->GetName().c_str(), node_type_index.index, offset, op_desc->GetStreamId(), size, + real_size); } else if (node_type_index.mem_type == kWorkspace) { vector workspace_list; workspace_list = op_desc->GetWorkspace(); @@ -821,6 +824,9 @@ void SetOffsetSize(const NodeTypeIndex &node_type_index, int64_t offset, size_t workspace_list.at(node_type_index.index) = offset; } op_desc->SetWorkspace(workspace_list); + GELOGI("[IMAS]Set %s name[%s] workspace[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu].", + graph_name.c_str(), op_desc->GetName().c_str(), node_type_index.index, offset, op_desc->GetStreamId(), size, + real_size); } } diff --git a/src/ge/graph/build/memory/graph_mem_assigner.cc b/src/ge/graph/build/memory/graph_mem_assigner.cc index 33e8fcad..bcae79ea 100644 --- a/src/ge/graph/build/memory/graph_mem_assigner.cc +++ b/src/ge/graph/build/memory/graph_mem_assigner.cc @@ -310,6 +310,11 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node) if (is_tensor_actual_size == 0) { AlignMemOffset(MEM_ALIGN_SIZE); } + GELOGI( + "[IMAS]Continuous input : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] " + "real_size[%ld].", + node->GetOwnerComputeGraph()->GetName().c_str(), peer_op_desc->GetName().c_str(), peer_out_data_anchor->GetIdx(), + pre_mem_offset, peer_op_desc->GetStreamId(), (memory_offset_[0].mem_offset_ - pre_mem_offset), tensor_desc_size); } return SUCCESS; @@ -340,6 +345,11 @@ Status GraphMemoryAssigner::AssignContinuousOutputMemory(const ge::NodePtr &node memory_offset_[0].mem_offset_ += tensor_desc_size; AlignMemOffset(MEM_ALIGN_SIZE); + GELOGI( + "[IMAS]Continuous output : Set %s name[%s] output[%d] offset to [%zu] stream_id[%ld] size[%zu] " + "real_size[%ld].", + node->GetOwnerComputeGraph()->GetName().c_str(), out_op_desc->GetName().c_str(), out_data_anchor->GetIdx(), + pre_mem_offset, out_op_desc->GetStreamId(), (memory_offset_[0].mem_offset_ - pre_mem_offset), tensor_desc_size); } out_op_desc->SetOutputOffset(output_list); @@ -413,8 +423,10 @@ Status 
GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousInputMemory() { pre_mem_offset, peer_op_desc->GetStreamId(), out_size, output_mem_size); } memory_offset_[0].mem_offset_ += extra_memory_size; - GELOGI("After reassign virtual input node[name:%s, type:%s] memory, memory offset = %zu.", - op_desc->GetName().c_str(), op_desc->GetType().c_str(), memory_offset_[0].mem_offset_); + size_t after_mem_offset = memory_offset_[0].mem_offset_; + AlignMemOffset(MEM_ALIGN_SIZE); + GELOGI("After reassign virtual input node[name:%s, type:%s] memory, memory offset = %zu, align memory = %zu.", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset, memory_offset_[0].mem_offset_); } } return SUCCESS; @@ -499,8 +511,10 @@ Status GraphMemoryAssigner::ReAssignReuseAndNoPaddingContinuousOutputMemory() { } op_desc->SetOutputOffset(output_list); memory_offset_[0].mem_offset_ += extra_memory_size; - GELOGI("After reassign virtual output node[name:%s, type:%s] memory, memory offset = %zu.", - op_desc->GetName().c_str(), op_desc->GetType().c_str(), memory_offset_[0].mem_offset_); + size_t after_mem_offset = memory_offset_[0].mem_offset_; + AlignMemOffset(MEM_ALIGN_SIZE); + GELOGI("After reassign virtual output node[name:%s, type:%s] memory, memory offset = %zu, align memory = %zu.", + op_desc->GetName().c_str(), op_desc->GetType().c_str(), after_mem_offset, memory_offset_[0].mem_offset_); } } return SUCCESS; @@ -567,6 +581,11 @@ Status GraphMemoryAssigner::ReAssignMergeMemory() { output_list[index] = data_output_offset; src_node->GetOpDesc()->SetOutputOffset(output_list); + GELOGI( + "[IMAS]ReAssignMergeMemory : Set %s name[%s] output[%d] offset to [%ld] stream_id[%ld] size[%ld] " + "real_size[%ld].", + n->GetOwnerComputeGraph()->GetName().c_str(), src_node->GetOpDesc()->GetName().c_str(), index, + data_output_offset, src_node->GetOpDesc()->GetStreamId(), max_output_size, max_output_size); input_list.emplace_back(data_output_offset); } @@ -897,6 +916,9 @@ Status GraphMemoryAssigner::AssignAtomicOutputMemory(const ge::NodePtr &node) { } output_list[output_index] = memory_offset_[0].mem_offset_; + GELOGI("[IMAS]Atomic output : Set %s name[%s] output[%ld] offset to [%zu] stream_id[%ld] size[%ld] real_size[%ld].", + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), output_index, memory_offset_[0].mem_offset_, + op_desc->GetStreamId(), size, size); memory_offset_[0].mem_offset_ += size; AlignMemOffset(MEM_ALIGN_SIZE); @@ -933,6 +955,11 @@ Status GraphMemoryAssigner::AssignOrdinaryAtomicWorkspaceMemory(const ge::OpDesc } workspace_vector[workspace_index] = memory_offset_[0].mem_offset_; + GELOGI( + "[IMAS]Atomic ordinary workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] " + "size[%ld] real_size[%ld].", + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index, memory_offset_[0].mem_offset_, + op_desc->GetStreamId(), workspace_size, workspace_size); memory_offset_[0].mem_offset_ += workspace_size; } @@ -958,6 +985,11 @@ Status GraphMemoryAssigner::AssignFusionAtomicWorkspaceMemory(const ge::OpDescPt auto workspace_size = info_iter.second; size_t workspace_offset = memory_offset_[0].mem_offset_; + GELOGI( + "[IMAS]Atomic fusion workspace : Set %s name[%s] workspace[%lu] offset to [%zu] stream_id[%ld] size[%ld] " + "real_size[%ld].", + compute_graph_->GetName().c_str(), op_desc->GetName().c_str(), workspace_index, memory_offset_[0].mem_offset_, + op_desc->GetStreamId(), workspace_size, workspace_size); memory_offset_[0].mem_offset_ += 
workspace_size; index_offset.insert(std::make_pair(workspace_index, workspace_offset)); @@ -1005,7 +1037,8 @@ ge::Status GraphMemoryAssigner::SetInputOffset() { GELOGE(FAILED, "memory_offset_ is empty."); return FAILED; } - GEEVENT("[IMAS]AfterAssignMemory : %s", compute_graph_->GetName().c_str()); + GEEVENT("[IMAS]AfterAssignMemory : %s memoffset[%zu]", compute_graph_->GetName().c_str(), + memory_offset_[0].mem_offset_); for (const ge::NodePtr &node : compute_graph_->GetDirectNode()) { if (UpdateOpInputOffset(node) != ge::SUCCESS) { GELOGE(ge::FAILED, "Update op input offset failed"); @@ -1166,6 +1199,12 @@ ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &n, int64_t ato GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_MEM_SIZE, mem_size_vector), GELOGE(FAILED, "SetListInt failed."); return FAILED); + + GELOGI( + "[IMAS]SetAtomicCleanAttr : Set %s name[%s] output[%d] offset to [%ld] streamid[%ld] size[%ld] " + "realsize[%ld].", + node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(), 0, atomic_mem_start, + node->GetOpDesc()->GetStreamId(), atomic_mem_size, atomic_mem_size); } } return SUCCESS; diff --git a/src/ge/graph/build/model_builder.cc b/src/ge/graph/build/model_builder.cc index af641dcc..ac61eeeb 100644 --- a/src/ge/graph/build/model_builder.cc +++ b/src/ge/graph/build/model_builder.cc @@ -28,6 +28,7 @@ #include "graph/common/omg_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_attr_value.h" +#include "graph/ge_context.h" #include "graph/ge_error_codes.h" #include "graph/manager/graph_mem_allocator.h" #include "graph/manager/graph_var_manager.h" @@ -39,7 +40,6 @@ #include "graph/utils/op_desc_utils.h" #include "graph/utils/tensor_utils.h" #include "graph/utils/type_utils.h" -#include "graph/ge_context.h" #include "init/gelib.h" #include "memory/memory_assigner.h" #include "omg/version.h" @@ -78,15 +78,16 @@ bool IsGeLocalOp(const ge::ConstOpDescPtr &op_desc) { ge::GeTensorDesc output_desc = op_desc->GetOutputDesc(0); return !(output_desc.GetDataType() == ge::DT_STRING); } - const set<string> ge_local_set = { - ge::STREAMMERGE, ge::MEMCPYASYNC, ge::STREAMACTIVE, ge::STREAMSWITCH, ge::VARIABLE, ge::NOOP, ge::CONSTANT, - ge::ENTER, ge::REFENTER, ge::LOOPCOND, ge::NEXTITERATION, ge::REFNEXTITERATION, ge::EXIT, ge::REFEXIT}; + const set<string> ge_local_set = {ge::STREAMMERGE, ge::MEMCPYASYNC, ge::STREAMACTIVE, ge::STREAMSWITCH, + ge::VARIABLE, ge::NOOP, ge::CONSTANT, ge::ENTER, + ge::REFENTER, ge::LOOPCOND, ge::NEXTITERATION, ge::REFNEXTITERATION, + ge::EXIT, ge::REFEXIT, ge::MEMCPYADDRASYNC}; return (ge_local_set.find(type) != ge_local_set.end()); } } // namespace namespace ge { -ModelBuilder::ModelBuilder(ge::ComputeGraphPtr compute_graph, const vector<SubGraphInfoPtr> &subgraphs, +ModelBuilder::ModelBuilder(ge::ComputeGraphPtr compute_graph, const Graph2SubGraphInfoList &subgraphs, const map<std::string, int> &stream_max_parallel_num, bool hcom_parallel, int mode) : mem_offset_(0), weight_offset_(kWeightsStartOffset), @@ -225,6 +226,25 @@ Status ModelBuilder::SetInputOutputDesc() { if (!is_loop_graph_ && node_op_desc->GetType() == LOOPCOND) { is_loop_graph_ = true; } + // If the user sets the input node format to ND, the Data and NetOutput nodes are expected to + // be in ND format in the final graph.
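+  // Note that the block below rewrites both SetFormat and SetOriginFormat on every input and
+  // output desc, keeping the current and origin descriptions consistent for these boundary nodes.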
+ if ((domi::GetContext().format == domi::DOMI_TENSOR_ND) && + ((node_op_desc->GetType() == DATA_TYPE) || (node_op_desc->GetType() == NETOUTPUT))) { + GELOGI("The node [%s] format should be set ND.", node_op_desc->GetName().c_str()); + auto inputDescsPtr = node_op_desc->GetAllInputsDescPtr(); + auto outputDescsPtr = node_op_desc->GetAllOutputsDescPtr(); + ge::Format format = ge::FORMAT_ND; + for (auto &inputDescPtr : inputDescsPtr) { + GE_CHECK_NOTNULL(inputDescPtr); + inputDescPtr->SetFormat(format); + inputDescPtr->SetOriginFormat(format); + } + for (auto &outputDescPtr : outputDescsPtr) { + GE_CHECK_NOTNULL(outputDescPtr); + outputDescPtr->SetFormat(format); + outputDescPtr->SetOriginFormat(format); + } + } if (node_op_desc->GetType() == DATA_TYPE || node_op_desc->GetType() == AIPP_DATA_TYPE) { GELOGD("Data node: %s.", n->GetName().c_str()); diff --git a/src/ge/graph/build/model_builder.h b/src/ge/graph/build/model_builder.h index 4bf03bdc..072126e3 100644 --- a/src/ge/graph/build/model_builder.h +++ b/src/ge/graph/build/model_builder.h @@ -37,7 +37,7 @@ namespace ge { class ModelBuilder { public: - ModelBuilder(ge::ComputeGraphPtr whole_graph, const std::vector &subgraphs, + ModelBuilder(ge::ComputeGraphPtr whole_graph, const Graph2SubGraphInfoList &subgraphs, const std::map &stream_max_parallel_num, bool hcom_parallel, int mode = static_cast(domi::BuildMode::GEN_TASK_WITHOUT_FUSION)); @@ -85,7 +85,7 @@ class ModelBuilder { ge::ComputeGraphPtr compute_graph_; - const std::vector &subgraphs_; + const Graph2SubGraphInfoList &subgraphs_; int64_t stream_num_; diff --git a/src/ge/graph/build/run_context.cc b/src/ge/graph/build/run_context.cc index e3230f5e..f2a41271 100644 --- a/src/ge/graph/build/run_context.cc +++ b/src/ge/graph/build/run_context.cc @@ -164,6 +164,9 @@ Status RunContextUtil::CreateRunContext(Model &model, const ComputeGraphPtr &gra return ret; } + GELOGI("CreateRunContext: data_mem_base_ = %p, weight_mem_base_ = %p, memory_size = %lu, weight_size = %lu", + data_mem_base_, weight_mem_base_, data_mem_size_, weight_mem_size_); + run_context_ = {rt_model_, nullptr, session_id, data_mem_size_, data_mem_base_, weight_mem_size_, weight_mem_base_, buffer, stream_list_, event_list_, label_list_}; return SUCCESS; diff --git a/src/ge/graph/build/stream_allocator.cc b/src/ge/graph/build/stream_allocator.cc index ffcc2315..88c5e055 100644 --- a/src/ge/graph/build/stream_allocator.cc +++ b/src/ge/graph/build/stream_allocator.cc @@ -40,7 +40,7 @@ const uint32_t kMaxSwitchStreamNum = 1; namespace ge { Status StreamAllocator::AssignLogicalStreams(const std::map &max_parallel_num, bool hcom_parallel) { - GELOGI("AssignLogicalStreams start."); + GELOGI("Assign logical streams start."); GE_CHECK_NOTNULL(whole_graph_); GraphUtils::DumpGEGraph(whole_graph_, "BeforeAssignedLogicalStreams"); GraphUtils::DumpGEGraphToOnnx(*whole_graph_, "BeforeAssignedLogicalStreams"); @@ -52,7 +52,6 @@ Status StreamAllocator::AssignLogicalStreams(const std::map &m } const map &scheduler_confs = gelib->DNNEngineManagerObj().GetSchedulers(); - LogicalStreamAllocator logical_allocator(scheduler_confs, max_parallel_num, hcom_parallel); Status status = logical_allocator.Assign(whole_graph_, subgraphs_, stream_num_); if (status != SUCCESS) { @@ -62,7 +61,7 @@ Status StreamAllocator::AssignLogicalStreams(const std::map &m GraphUtils::DumpGEGraph(whole_graph_, "AfterAssignedLogicalStreams"); GraphUtils::DumpGEGraphToOnnx(*whole_graph_, "AfterAssignedLogicalStreams"); - GELOGI("AssignLogicalStreams success."); + 
GELOGI("Assign logical streams success."); return SUCCESS; } @@ -136,7 +135,7 @@ Status StreamAllocator::RefreshRealStream(int64_t &stream_num, int64_t &event_nu GELOGI("None of nodes need to assign stream, stream num is 0, it will cause error, so change it to 1"); stream_num_ = 1; } - GELOGI("stream_num_: %ld, event_num_: %u.", stream_num_, event_num_); + GELOGI("stream num: %ld, event num: %u.", stream_num_, event_num_); GELOGI("RefreshRealStream successfully."); stream_num = stream_num_; @@ -148,7 +147,7 @@ Status StreamAllocator::RefreshRealStream(int64_t &stream_num, int64_t &event_nu // Split the stream according to the maximum number of nodes in the stream. Status StreamAllocator::SplitStreams() { if (stream_num_ == 0) { - GELOGI("stream_num_ is 0"); + GELOGI("The number of streams is 0 and no need to split."); return SUCCESS; } diff --git a/src/ge/graph/build/stream_allocator.h b/src/ge/graph/build/stream_allocator.h index e3901205..a18e00d7 100644 --- a/src/ge/graph/build/stream_allocator.h +++ b/src/ge/graph/build/stream_allocator.h @@ -30,7 +30,7 @@ namespace ge { class StreamAllocator { public: - StreamAllocator(ComputeGraphPtr whole_graph, const std::vector &subgraphs) + StreamAllocator(ComputeGraphPtr whole_graph, const Graph2SubGraphInfoList &subgraphs) : whole_graph_(std::move(whole_graph)), subgraphs_(subgraphs) {} StreamAllocator(const StreamAllocator &) = delete; StreamAllocator &operator=(const StreamAllocator &) = delete; @@ -75,7 +75,7 @@ class StreamAllocator { bool IsRecvNodeActivatedBySendNode(const NodePtr &send_node_ptr, const NodePtr &recv_node_ptr) const; ComputeGraphPtr whole_graph_; - const std::vector &subgraphs_; + const Graph2SubGraphInfoList &subgraphs_; int64_t stream_num_{0}; uint32_t event_num_{0}; diff --git a/src/ge/graph/build/stream_graph_optimizer.cc b/src/ge/graph/build/stream_graph_optimizer.cc index 6e0211de..42d1afc1 100644 --- a/src/ge/graph/build/stream_graph_optimizer.cc +++ b/src/ge/graph/build/stream_graph_optimizer.cc @@ -29,19 +29,21 @@ static const int64_t kInvalidStream = -1; namespace ge { StreamGraphOptimizer::~StreamGraphOptimizer() {} -void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, vector &subgraph_infos) { +void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map) { size_t node_size = comp_graph->GetDirectNodesSize(); GELOGI("Refresh placeholder and end nodeId start from node num: %zu", node_size); - for (const auto &sub_graph_info : subgraph_infos) { - ComputeGraphPtr sub_graph = sub_graph_info->GetSubGraph(); - if (sub_graph == nullptr) { - continue; - } - for (ge::NodePtr &node : sub_graph->GetDirectNode()) { - GE_CHECK_NOTNULL_EXEC(node->GetOpDesc(), return ); - if ((node->GetType() == END) || (node->GetType() == PLACEHOLDER)) { - node->GetOpDesc()->SetId(static_cast(node_size)); - node_size++; + for (const auto &subgraph_pair : subgraph_map) { + for (const auto &subgraph_info : subgraph_pair.second) { + ComputeGraphPtr subgraph = subgraph_info->GetSubGraph(); + if (subgraph == nullptr) { + continue; + } + for (ge::NodePtr &node : subgraph->GetDirectNode()) { + GE_CHECK_NOTNULL_EXEC(node->GetOpDesc(), return ); + if ((node->GetType() == END) || (node->GetType() == PLACEHOLDER)) { + node->GetOpDesc()->SetId(static_cast(node_size)); + node_size++; + } } } } @@ -71,67 +73,71 @@ bool StreamGraphOptimizer::IsSameStreamId(const ComputeGraphPtr &comp_graph) { } Status StreamGraphOptimizer::OptimizeStreamedSubGraph(const ComputeGraphPtr &comp_graph, 
- vector &subgraph_infos, + Graph2SubGraphInfoList &subgraph_map, struct RunContext &run_context) { - Status ret = SUCCESS; - GELOGI("Begin to Get optimize streamed subgraph."); + GELOGI("Optimize streamed subgraph start."); - RefreshNodeId(comp_graph, subgraph_infos); + RefreshNodeId(comp_graph, subgraph_map); std::shared_ptr instance = ge::GELib::GetInstance(); GE_CHECK_NOTNULL(instance); - for (auto &sub_graph_info : subgraph_infos) { - ComputeGraphPtr sub_graph = sub_graph_info->GetSubGraph(); - if (sub_graph == nullptr) { - continue; - } + for (const auto &subgraph_pair : subgraph_map) { + for (const auto &subgraph_info : subgraph_pair.second) { + ComputeGraphPtr subgraph = subgraph_info->GetSubGraph(); + GE_CHECK_NOTNULL(subgraph); - std::string engine_name = sub_graph_info->GetEngineName(); + GELOGI("Optimize subgraph %s", subgraph->GetName().c_str()); - vector graph_optimizers; - if (instance->DNNEngineManagerObj().IsEngineRegistered(engine_name)) { - instance->OpsKernelManagerObj().GetGraphOptimizerByEngine(engine_name, graph_optimizers); - GELOGI("Subgraph: %s start optimize streamed graph. engineName: %s, subgraph num: %zu, graph Optimizer num: %zu.", - sub_graph->GetName().c_str(), engine_name.c_str(), subgraph_infos.size(), graph_optimizers.size()); + std::string engine_name = subgraph_info->GetEngineName(); - auto nodes = sub_graph->GetDirectNode(); - if (nodes.empty()) { - continue; - } - if (!IsSameStreamId(sub_graph)) { - GELOGI("There are more than one stream in subgraph %s", sub_graph->GetName().c_str()); - continue; - } - OpDescPtr op_desc = nodes.at(0)->GetOpDesc(); - GE_CHECK_NOTNULL(op_desc); - int64_t stream_id = op_desc->GetStreamId(); - if (static_cast(stream_id) >= run_context.graphStreamList.size()) { - GELOGE(FAILED, "stream_id is bigger than run_context.graphStreamList.size()"); - return FAILED; - } - run_context.stream = run_context.graphStreamList[stream_id]; - GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu.", - sub_graph->GetName().c_str(), engine_name.c_str(), stream_id, - static_cast(reinterpret_cast(run_context.stream))); - for (auto iter = graph_optimizers.begin(); iter != graph_optimizers.end(); ++iter) { - GE_CHECK_NOTNULL(*iter); - ret = (*iter)->OptimizeStreamGraph(*sub_graph, run_context); - if (ret != SUCCESS) { - GELOGE(ret, - "[optimizeStreamedSubGraph]: optimize streamed subgraph failed, subgraph: %s, engine_name: %s, graph " - "Optimizer num: %zu, ret: %u", - sub_graph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size(), ret); - return ret; + vector graph_optimizers; + if (instance->DNNEngineManagerObj().IsEngineRegistered(engine_name)) { + instance->OpsKernelManagerObj().GetGraphOptimizerByEngine(engine_name, graph_optimizers); + GELOGI("Subgraph: %s start optimize streamed graph. 
engineName: %s, graph Optimizer num: %zu.", + subgraph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size()); + + auto nodes = subgraph->GetDirectNode(); + if (nodes.empty()) { + continue; + } + if (!IsSameStreamId(subgraph)) { + GELOGI("There is more than one stream in subgraph %s", subgraph->GetName().c_str()); + continue; + } + OpDescPtr op_desc = nodes.at(0)->GetOpDesc(); + GE_CHECK_NOTNULL(op_desc); + int64_t stream_id = op_desc->GetStreamId(); + if (static_cast(stream_id) >= run_context.graphStreamList.size()) { + GELOGE(FAILED, "stream_id %ld is bigger than run_context.graphStreamList.size() %zu", stream_id, + run_context.graphStreamList.size()); + return FAILED; + } + run_context.stream = run_context.graphStreamList[stream_id]; + GELOGD("Subgraph has same stream id, subgraph: %s, engine_name: %s, stream_id: %ld, rtstream: %lu.", + subgraph->GetName().c_str(), engine_name.c_str(), stream_id, + static_cast(reinterpret_cast(run_context.stream))); + for (auto iter = graph_optimizers.begin(); iter != graph_optimizers.end(); ++iter) { + GE_CHECK_NOTNULL(*iter); + Status ret = (*iter)->OptimizeStreamGraph(*subgraph, run_context); + if (ret != SUCCESS) { + GELOGE( + ret, + "[optimizeStreamedSubGraph]: optimize streamed subgraph failed, subgraph: %s, engine_name: %s, graph " + "Optimizer num: %zu, ret: %u", + subgraph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size(), ret); + return ret; + } + GELOGI( + "[optimizeStreamedSubGraph]: optimize streamed subgraph success, subgraph: %s, engine_name: %s, graph " + "Optimizer num: %zu!", + subgraph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size()); } - GELOGI( - "[optimizeStreamedSubGraph]: optimize streamed subgraph success, subgraph: %s, engine_name: %s, graph " - "Optimizer num: %zu!", - sub_graph->GetName().c_str(), engine_name.c_str(), graph_optimizers.size()); } } } - return ret; + GELOGI("Optimize streamed subgraph success."); + return SUCCESS; } } // namespace ge diff --git a/src/ge/graph/build/stream_graph_optimizer.h b/src/ge/graph/build/stream_graph_optimizer.h index a65f95f2..3133d32d 100644 --- a/src/ge/graph/build/stream_graph_optimizer.h +++ b/src/ge/graph/build/stream_graph_optimizer.h @@ -35,11 +35,11 @@ class StreamGraphOptimizer { virtual ~StreamGraphOptimizer(); - Status OptimizeStreamedSubGraph(const ComputeGraphPtr &comp_graph, std::vector &subgraph_ptr_list, + Status OptimizeStreamedSubGraph(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map, struct RunContext &run_context); private: - void RefreshNodeId(const ComputeGraphPtr &comp_graph, std::vector &subgraph_ptr_list); + void RefreshNodeId(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map); bool IsSameStreamId(const ComputeGraphPtr &comp_graph); }; diff --git a/src/ge/graph/build/task_generator.cc b/src/ge/graph/build/task_generator.cc index e8f6dd26..cc34e352 100644 --- a/src/ge/graph/build/task_generator.cc +++ b/src/ge/graph/build/task_generator.cc @@ -221,10 +221,8 @@ Status TaskGenerator::SaveL1fusionNodes(map> &l1_f if (call_check) { auto input_group_id = *input_group_ids.begin(); if (group_id != input_group_id) { - GELOGE(INTERNAL_ERROR, - "L1Fusion: node[name:%s(%s) with group id:%ld and diff from it's input nodes's group id:%ld ", + GELOGW("L1Fusion: node[name:%s(%s)] with group id:%ld differs from its input nodes' group id:%ld ", name.c_str(), type.c_str(), group_id, input_group_id); - return INTERNAL_ERROR; } } } diff --git a/src/ge/graph/label/label_maker.cc
b/src/ge/graph/label/label_maker.cc index 9ab6824c..bf8949f0 100644 --- a/src/ge/graph/label/label_maker.cc +++ b/src/ge/graph/label/label_maker.cc @@ -172,7 +172,7 @@ NodePtr LabelMaker::AddLabelSetLeave(const ComputeGraphPtr &graph, const std::st GELOGI("LabelSet: Create node %s.", op_desc->GetName().c_str()); (void)AttrUtils::SetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, index); - NodePtr label_set = graph->AddNodeFront(op_desc); + NodePtr label_set = graph->AddNode(op_desc); GE_CHECK_NOTNULL_EXEC(label_set, return nullptr); // Link control edge to graph tail. @@ -202,7 +202,7 @@ NodePtr LabelMaker::AddLabelGotoEnter(const ComputeGraphPtr &graph, const std::s return nullptr; } - OpDescPtr op_desc = MakeShared(name, LABELGOTO); + OpDescPtr op_desc = MakeShared(name, LABELGOTOEX); GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); SetStreamIdEnter(graph, op_desc); @@ -238,7 +238,7 @@ NodePtr LabelMaker::AddLabelGotoLeave(const ComputeGraphPtr &graph, const std::s const NodePtr &node = *it; GE_CHECK_NOTNULL_EXEC(node, return nullptr); - OpDescPtr op_desc = MakeShared(name, LABELGOTO); + OpDescPtr op_desc = MakeShared(name, LABELGOTOEX); GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); SetStreamIdLeave(graph, op_desc); @@ -366,6 +366,7 @@ NodePtr LabelMaker::AddLabelSwitchIndex(const ComputeGraphPtr &graph, const std: OpDescPtr op_desc = MakeShared(name, DATA); GE_CHECK_NOTNULL_EXEC(op_desc, return nullptr); + op_desc->SetStreamId(kInvalidStreamId); GELOGI("Data: Create node %s.", op_desc->GetName().c_str()); if (op_desc->AddOutputDesc(desc) != GRAPH_SUCCESS) { diff --git a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc index c3de44c9..9b3c7a0f 100644 --- a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc +++ b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.cc @@ -20,11 +20,11 @@ namespace { const uint32_t kCoreDim = 1; // for rtCpuKernelLaunch const char *const kCpuTaskModelEnqueue = "modelEnqueue"; -const char *const kCpuTaskPrepareInput = "modelPrepareInput"; const char *const kCpuTaskWaitEndGraph = "modelWaitEndGraph"; -const char *const kCpuTaskPrepareOutput = "modelPrepareOutput"; +const char *const kCpuTaskPrepareOutput = "bufferPrepareOutput"; const char *const kCpuTaskModelDequeue = "modelDequeue"; const char *const kCpuTaskModelRepeat = "modelRepeat"; +const char *const kCpuTaskZeroCopy = "zeroCpy"; } // namespace namespace ge { @@ -93,19 +93,19 @@ Status CpuTaskModelDequeue::Distribute() { /// /// @ingroup ge -/// @brief definiteness queue schedule, bind output queue to task. -/// @param [in] addr: NetOutput Op input tensor address. -/// @param [in] size: NetOutput Op input tensor size. -/// @param [in] in_mbuf: input mbuf addr for input data. +/// @brief definiteness queue schedule, zero copy. +/// @param [in] mbuf_list: input/output mbuf addr list for input/output data. 
+/// @param [in] outside_addrs: model input/output memory addr /// @return: 0 for success / others for failed /// -Status CpuTaskPrepareInput::Init(uintptr_t addr, uint32_t size, uintptr_t in_mbuf) { +Status CpuTaskZeroCopy::Init(std::vector &mbuf_list, + std::map> &outside_addrs) { if ((args_ != nullptr) || (args_size_ > 0)) { GELOGE(FAILED, "Task already initialized, size: %u", args_size_); return FAILED; } - args_size_ = sizeof(PrepareInputInfo); + args_size_ = sizeof(AddrMapInfo); rtError_t status = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (status != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); @@ -113,36 +113,99 @@ Status CpuTaskPrepareInput::Init(uintptr_t addr, uint32_t size, uintptr_t in_mbu } GE_PRINT_DYNAMIC_MEMORY(rtMalloc, "args data.", args_size_) - PrepareInputInfo prepare; - prepare.in_mbuf = in_mbuf; - prepare.mbuf_offset = 0; - prepare.data_size = size; - prepare.data_addr = addr; - status = rtMemcpy(args_, args_size_, &prepare, args_size_, RT_MEMCPY_HOST_TO_DEVICE); + AddrMapInfo addr_map_info; + for (const auto &addrs : outside_addrs) { + addr_map_info.addr_num += addrs.second.size(); + } + GELOGI("addr_map_info.addr_num is %u", addr_map_info.addr_num); + + // init src_addrs/dst_addrs + size_t index = 0; + vector src_addrs; + vector dst_addrs; + for (const auto &addrs : outside_addrs) { + for (size_t i = 0; i < addrs.second.size(); ++i) { + src_addrs.push_back(mbuf_list.at(index)); + dst_addrs.push_back(reinterpret_cast(addrs.second.at(i))); + } + index++; + } + + // malloc mem for src_addrs/dst_addrs, and copy data of src_addrs/dst_addrs + status = rtMalloc(&src_addr_, src_addrs.size() * sizeof(uint64_t), RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_FAILED; + } + status = rtMemcpy(src_addr_, src_addrs.size() * sizeof(uint64_t), src_addrs.data(), + src_addrs.size() * sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE); if (status != RT_ERROR_NONE) { GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); return RT_FAILED; } + status = rtMalloc(&dst_addr_, dst_addrs.size() * sizeof(uint64_t), RT_MEMORY_HBM); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt malloc failed, status: 0x%x", status); + return RT_FAILED; + } + status = rtMemcpy(dst_addr_, dst_addrs.size() * sizeof(uint64_t), dst_addrs.data(), + dst_addrs.size() * sizeof(uint64_t), RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_FAILED; + } + + // src_addr_list is initialized to src_addr_, which points to src_addrs + if (!src_addrs.empty() && !dst_addrs.empty()) { + addr_map_info.src_addr_list = reinterpret_cast(src_addr_); + addr_map_info.dst_addr_list = reinterpret_cast(dst_addr_); + GELOGI("src_addr_list is %lu, dst_addr_list is %lu", addr_map_info.src_addr_list, addr_map_info.dst_addr_list); + } + + status = rtMemcpy(args_, args_size_, &addr_map_info, sizeof(AddrMapInfo), RT_MEMCPY_HOST_TO_DEVICE); + if (status != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt memcpy failed, status: 0x%x", status); + return RT_FAILED; + } return SUCCESS; } -Status CpuTaskPrepareInput::Distribute() { +Status CpuTaskZeroCopy::Distribute() { if ((args_ == nullptr) || (args_size_ == 0) || (stream_ == nullptr)) { GELOGE(FAILED, "Task not initialized, distribute failed, size: %u", args_size_); return FAILED; } - rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskPrepareInput, kCoreDim, args_,
args_size_, nullptr, stream_); + rtError_t status = rtCpuKernelLaunch(nullptr, kCpuTaskZeroCopy, kCoreDim, args_, args_size_, nullptr, stream_); if (status != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Call rt CpuKernelLaunch PrepareInput failed, status: 0x%X", status); + GELOGE(RT_FAILED, "Call rt CpuKernelLaunch ZeroCopy failed, status: 0x%X", status); return RT_FAILED; } - GELOGI("Cpu kernel launch prepare input task success."); + GELOGI("Cpu kernel launch zero copy task success."); return SUCCESS; } +CpuTaskZeroCopy::~CpuTaskZeroCopy() { + if (src_addr_ == nullptr && dst_addr_ == nullptr) { + return; + } + if (src_addr_ != nullptr) { + rtError_t status = rtFree(src_addr_); + if (status != RT_ERROR_NONE) { + GELOGW("Call rt free failed, status: 0x%x", status); + } + } + if (dst_addr_ != nullptr) { + rtError_t status = rtFree(dst_addr_); + if (status != RT_ERROR_NONE) { + GELOGW("Call rt free failed, status: 0x%x", status); + } + } + src_addr_ = nullptr; + dst_addr_ = nullptr; +} /// /// @ingroup ge /// @brief definiteness queue schedule, bind output queue to task. diff --git a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h index 8a9af63f..c4ae4df5 100644 --- a/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h +++ b/src/ge/graph/load/new_model_manager/cpu_queue_schedule.h @@ -47,6 +47,13 @@ struct PrepareOutputInfo { uintptr_t out_mbuf; // output mbuf addr }; +// For AICPU task "modelZeroCopy" +struct AddrMapInfo { + uint32_t addr_num = 0; + uint64_t src_addr_list; + uint64_t dst_addr_list; +}; + /// /// @ingroup ge /// @brief CpuTask base, inherit from TaskInfo used for manage. @@ -78,17 +85,21 @@ class CpuTaskModelDequeue : public CpuTaskInfo { /// /// @ingroup ge -/// @brief definiteness queue schedule, bind output queue to task. +/// @brief definiteness queue schedule, zero copy. 
/// -class CpuTaskPrepareInput : public CpuTaskInfo { +class CpuTaskZeroCopy : public CpuTaskInfo { public: - explicit CpuTaskPrepareInput(rtStream_t stream) : CpuTaskInfo(stream) {} - ~CpuTaskPrepareInput() override {} + explicit CpuTaskZeroCopy(rtStream_t stream) : CpuTaskInfo(stream) {} + ~CpuTaskZeroCopy() override; Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override { return SUCCESS; } - Status Init(uintptr_t addr, uint32_t size, uintptr_t in_mbuf); + Status Init(std::vector &mbuf_list, std::map> &outside_addrs); Status Distribute() override; + + private: + void *src_addr_ = nullptr; + void *dst_addr_ = nullptr; }; /// diff --git a/src/ge/graph/load/new_model_manager/davinci_model.cc b/src/ge/graph/load/new_model_manager/davinci_model.cc index 7b743f3c..19c0ab16 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.cc +++ b/src/ge/graph/load/new_model_manager/davinci_model.cc @@ -78,6 +78,7 @@ namespace { const uint32_t kDataIndex = 0; const uint32_t kTrueBranchStreamNum = 1; const uint32_t kThreadNum = 16; +const uint32_t kAddrLen = sizeof(void *); const int kDecimal = 10; const int kBytes = 8; const uint32_t kDataMemAlignSizeCompare = 64; @@ -100,17 +101,20 @@ class RtContextSwitchGuard { ret = rtCtxSetCurrent(current_); if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Failed to switch context to normal, device %u", device_id); + GELOGE(RT_FAILED, "Failed to switch context to normal, context %p, device %u", current_, device_id); return; } + GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_); } ~RtContextSwitchGuard() { if (current_ != nullptr) { auto ret = rtCtxDestroy(current_); + GELOGD("Destroy current context %p result %d", current_, ret); } if (last_ != nullptr) { auto ret = rtCtxSetCurrent(last_); + GELOGD("Restore last context %p result %d.", last_, ret); } } @@ -149,7 +153,10 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM); if (var_addr == nullptr) { - GELOGE(INTERNAL_ERROR, "Failed to copy var %s from device, cant not get var addr", var->GetName().c_str()); + GELOGE(INTERNAL_ERROR, + "Failed to copy var %s from device, can not get " + "var addr from logic addr %p", + var->GetName().c_str(), var_logic); return INTERNAL_ERROR; } @@ -177,6 +184,8 @@ Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_pt GELOGD("Copy var %s from device to host, size %ld", var->GetName().c_str(), var_size_bytes); var_data.swap(var_host); + GELOGI("var_logic:%p, var_addr:%p", var_logic, var_addr); + return SUCCESS; } @@ -230,7 +239,7 @@ Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats } } else if (trans_info.node_type == CAST) { auto input_shape = trans_info.input.GetShape(); - auto src_data_size = input_shape.GetShapeSize(); + auto src_data_size = input_shape.GetShapeSize() == 0 ?
1 : input_shape.GetShapeSize(); auto src_data_type = trans_info.input.GetDataType(); auto dst_data_type = trans_info.output.GetDataType(); GELOGD("Trans data type from %s to %s, input shape %s, data size %ld", @@ -284,6 +293,8 @@ Status ReAssignVarAddr(uint64_t session_id, const std::string &var_name, const G } *var_device = var_addr; + GELOGI("var_logic:%p, var_addr:%p", var_logic, var_addr); + return SUCCESS; } @@ -399,10 +410,10 @@ DavinciModel::DavinciModel(int32_t priority, const std::shared_ptrGetVarMemoryBase(RT_MEMORY_HBM); + GELOGI("[IMAS]InitModelMem graph_%u MallocMemory type[V] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id, + var_mem_base_, TotalVarMemSize()); } runtime_param_.mem_base = mem_base_; @@ -631,19 +648,16 @@ Status DavinciModel::DoTaskSink() { if (model_task_def_) { GELOGI("do task_sink."); - // will adjust stream indication, load fist. + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(InitTaskInfo(*model_task_def_.get()) != SUCCESS, return FAILED, + "InitTaskInfo failed."); GE_CHK_STATUS_RET(LoadWithQueue(), "LoadWithQueue failed."); - + // will adjust stream indication, load first. for (size_t i = 0; i < stream_list_.size(); i++) { GE_IF_BOOL_EXEC(active_stream_indication_.count(i) > 0, GELOGI("rtModelBindStream[%zu]", i); GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, stream_list_[i], RT_INVALID_FLAG)); continue;); // bind rt_model_handel to all streams that relates to op - GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, stream_list_[i], 0)); + GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, stream_list_[i], RT_HEAD_STREAM)); } - - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(InitTaskInfo(*model_task_def_.get()) != SUCCESS, return FAILED, - "InitTaskInfo failed."); - GE_CHK_STATUS_RET(DistributeTask(), "Distribute failed."); GE_CHK_RT_RET(rtModelLoadComplete(rt_model_handle_)); @@ -715,6 +729,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size GE_DISMISS_GUARD(stream); stream_list_.push_back(stream); + GELOGD("Stream index:%u, stream:%p.", i, stream); } for (uint32_t i = 0; i < EventNum(); i++) { @@ -723,12 +738,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size event_list_.push_back(rt_event); } - for (uint32_t i = 0; i < LabelNum(); i++) { - rtLabel_t rt_label; - GE_CHK_RT_RET(rtLabelCreate(&rt_label)); - GE_CHK_BOOL_RET_STATUS(rt_label != nullptr, FAILED, "rt_label is nullptr."); - label_list_.push_back(rt_label); - } + label_list_.resize(LabelNum(), nullptr); // create model_handle to load model GE_CHK_RT_RET(rtModelCreate(&rt_model_handle_, 0)); @@ -803,11 +813,17 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size /// Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { uint32_t data_op_index = 0; - std::map> input_data_info; - GE_TIMESTAMP_CALLNUM_START(LoadTBEKernelBinToOpDesc); GE_TIMESTAMP_CALLNUM_START(InitTbeHandle); + typedef Status (DavinciModel::*OpDescCall)(const OpDescPtr &); + static std::map op_desc_handle = { + {VARIABLE, &DavinciModel::InitVariable}, {CONSTANTOP, &DavinciModel::InitConstant}, + {NETOUTPUT, &DavinciModel::InitNetOutput}, {ENDGRAPH, &DavinciModel::InitEndGraph}, + {STREAMACTIVE, &DavinciModel::InitStreamActive}, {STREAMSWITCH, &DavinciModel::InitStreamSwitch}, + {STREAMSWITCHN, &DavinciModel::InitStreamSwitchN}, {LABELSET, &DavinciModel::InitLabelSet}, + }; + auto nodes = compute_graph->GetAllNodes(); const TBEKernelStore &tbekernel_store = ge_model_->GetTBEKernelStore(); for (size_t i = 0; i < nodes.size(); i++) { @@ -825,7
+841,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { GE_TIMESTAMP_ADD(LoadTBEKernelBinToOpDesc); if (IsDataOp(op_desc->GetType())) { - if (InitDataOp(node, data_op_index, input_data_info) != SUCCESS) { + if (InitDataOp(node, data_op_index) != SUCCESS) { GELOGE(PARAM_INVALID, "Data init failed, Name: %s", op_desc->GetName().c_str()); return PARAM_INVALID; } @@ -839,32 +855,15 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { continue; } - if (op_desc->GetType() == VARIABLE) { - variable_op_list_.push_back(op_desc); - continue; - } - - if (op_desc->GetType() == NETOUTPUT) { - if (InitNetOutput(op_desc) != SUCCESS) { + auto it = op_desc_handle.find(op_desc->GetType()); + if (it != op_desc_handle.end()) { + if ((this->*it->second)(op_desc) != SUCCESS) { GELOGE(PARAM_INVALID, "NetOutput init failed, Name: %s", op_desc->GetName().c_str()); return PARAM_INVALID; } continue; } - // Initialize constant op, only applies to training, ignoring inference constant op - if (op_desc->GetType() == CONSTANTOP) { - if (InitConstant(op_desc) != SUCCESS) { - GELOGE(PARAM_INVALID, "Constant init failed. %s", op_desc->GetName().c_str()); - return PARAM_INVALID; - } - continue; - } - - if (op_desc->GetType() == ENDGRAPH) { - end_graph_op_ = op_desc; - } - GE_TIMESTAMP_RESTART(InitTbeHandle); uint32_t run_mode = static_cast(domi::ImplyType::INVALID); if (AttrUtils::GetInt(op_desc, ATTR_NAME_IMPLY_TYPE, run_mode) && @@ -883,17 +882,11 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { } } GE_TIMESTAMP_ADD(InitTbeHandle); - - if (MarkActiveStream(op_desc) != SUCCESS) { - GELOGE(PARAM_INVALID, "MarkActiveStream failed, node:%s, opIndex:%zu", op_desc->GetName().c_str(), i); - return PARAM_INVALID; - } } - Status ret = CombineDataInfo(input_data_info); GE_TIMESTAMP_CALLNUM_END(LoadTBEKernelBinToOpDesc, "GraphLoader::LoadTBEKernelBinToOpDesc."); GE_TIMESTAMP_CALLNUM_END(InitTbeHandle, "GraphLoader::InitTbeHandle."); - return ret; + return SUCCESS; } /// @ingroup ge @@ -902,8 +895,7 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) { /// @param [in/out] data_op_index: NetOutput addr size info. /// @param [in/out] input_data_info: Data index and addr info {index, {size, addr}}. /// @return Status -Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, - std::map> &input_data_info) { +Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index) { // op_desc Checked by Init: Data, valid. auto op_desc = node->GetOpDesc(); uint32_t parent_index = 0; // Ignore subgraph Data Node. @@ -925,20 +917,20 @@ Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index, // Make information for copy input data. 
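The InitNodes hunk above replaces a chain of per-type if blocks with `op_desc_handle`, a lookup table from op type to a member-function handler invoked via `(this->*it->second)(op_desc)`. A minimal, self-contained sketch of that dispatch pattern follows; the class, handler names, and type strings are illustrative stand-ins, not the GE definitions:

```cpp
#include <iostream>
#include <map>
#include <string>

class NodeIniter {
 public:
  int Init(const std::string &type) {
    using Handler = int (NodeIniter::*)();
    // Type string -> member-function handler, mirroring op_desc_handle.
    static const std::map<std::string, Handler> kHandlers = {
        {"Variable", &NodeIniter::InitVariable},
        {"NetOutput", &NodeIniter::InitNetOutput},
    };
    auto it = kHandlers.find(type);
    if (it == kHandlers.end()) {
      return 0;  // types without a dedicated handler use the common path
    }
    return (this->*it->second)();  // call through the member pointer
  }

 private:
  int InitVariable() { std::cout << "init variable\n"; return 0; }
  int InitNetOutput() { std::cout << "init netoutput\n"; return 0; }
};

int main() {
  NodeIniter initer;
  return initer.Init("Variable");
}
```

The table keeps the main loop flat: supporting a new op type means registering one more entry instead of adding another branch.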
const vector output_size_list = ModelUtils::GetOutputSize(op_desc); - const vector output_addr_list = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc); - if (output_size_list.empty() || output_addr_list.empty() || (output_size_list.size() != output_addr_list.size())) { + const vector virtual_addr_list = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc, false); + if (output_size_list.empty() || virtual_addr_list.empty() || (output_size_list.size() != virtual_addr_list.size())) { GELOGE(PARAM_INVALID, "Data[%s] init failed: Output size is %zu, Output addr is %zu", op_desc->GetName().c_str(), - output_size_list.size(), output_addr_list.size()); + output_size_list.size(), virtual_addr_list.size()); return PARAM_INVALID; } auto data_index = data_op_index; if (AttrUtils::GetInt(op_desc, ATTR_NAME_INDEX, data_index)) { - GELOGI("ge_train:get new index %u, old %u", data_index, data_op_index); + GELOGI("ge_train: get new index %u, old %u", data_index, data_op_index); } - input_data_info[data_index] = {output_size_list[kDataIndex], output_addr_list[kDataIndex]}; - SetInputOutsideAddr(output_addr_list); + input_data_info_[data_index] = {output_size_list[kDataIndex], virtual_addr_list[kDataIndex]}; + SetInputOutsideAddr(virtual_addr_list); data_op_index++; if (InitInputZeroCopyNodes(node) != SUCCESS) { GELOGE(PARAM_INVALID, "Input zero copy nodes init failed!"); @@ -1001,43 +993,78 @@ Status DavinciModel::InitNetOutput(const OpDescPtr &op_desc) { // Make information for copy output data. const vector input_size_list = ModelUtils::GetInputSize(op_desc); - const vector input_addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc); - if (input_size_list.empty() && input_addr_list.empty()) { + const vector virtual_addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc, false); + if (input_size_list.empty() && virtual_addr_list.empty()) { GELOGI("NetOutput[%s] is empty.", op_desc->GetName().c_str()); return SUCCESS; } - if (input_size_list.empty() || input_size_list.size() != input_addr_list.size() || + if (input_size_list.empty() || input_size_list.size() != virtual_addr_list.size() || input_size_list.size() != output_size_list.size()) { GELOGE(PARAM_INVALID, "NetOutput[%s] init failed: Input size is %zu, Input addr is %zu, Output size is %zu", - op_desc->GetName().c_str(), input_size_list.size(), input_addr_list.size(), output_size_list.size()); + op_desc->GetName().c_str(), input_size_list.size(), virtual_addr_list.size(), output_size_list.size()); return PARAM_INVALID; } - output_size_list_.insert(output_size_list_.end(), input_size_list.begin(), input_size_list.end()); - output_addr_list_.insert(output_addr_list_.end(), input_addr_list.begin(), input_addr_list.end()); - SetOutputOutsideAddr(input_addr_list); + size_t num = output_data_info_.size(); + for (size_t idx = 0; idx < input_size_list.size(); ++idx) { + output_data_info_[num + idx] = {input_size_list[idx], virtual_addr_list[idx]}; + } + + SetOutputOutsideAddr(virtual_addr_list); return SUCCESS; } /// @ingroup ge -/// @brief Make Input and Output addr for feature use. -/// @param [in] input_data_info: Data index and addr info {index, {size, addr}}. +/// @brief LabelSet Op Initialize. +/// @param [in] op_desc: LabelSet Op descriptor. 
/// @return Status -Status DavinciModel::CombineDataInfo(const std::map> &input_data_info) { - input_size_list_.resize(data_op_list_.size()); - input_addr_list_.resize(data_op_list_.size()); - for (size_t index = 0; index < data_op_list_.size(); ++index) { - auto it = input_data_info.find(index); - if (it == input_data_info.end()) { - GELOGE(PARAM_INVALID, "Data init failed: index %zu, Data Op size is %zu, Input addr is %zu", index, - data_op_list_.size(), input_data_info.size()); - return INTERNAL_ERROR; - } - input_size_list_[index] = it->second.first; - input_addr_list_[index] = it->second.second; +Status DavinciModel::InitLabelSet(const OpDescPtr &op_desc) { + uint32_t label_index = 0; + if (!AttrUtils::GetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, label_index)) { + GELOGE(INTERNAL_ERROR, "InitLabelSet: %s attr [%s] not exist.", op_desc->GetName().c_str(), + ATTR_NAME_LABEL_SWITCH_INDEX.c_str()); + return INTERNAL_ERROR; } + if (label_index >= LabelNum()) { + GELOGE(INTERNAL_ERROR, "InitLabelSet: label index: %u >= label size: %zu.", label_index, LabelNum()); + return INTERNAL_ERROR; + } + if (label_id_indication_.count(label_index) > 0) { + GELOGE(INTERNAL_ERROR, "InitLabelSet: %s label index: %u already used.", op_desc->GetName().c_str(), label_index); + return INTERNAL_ERROR; + } + + rtStream_t stream = nullptr; + uint32_t stream_id = static_cast(op_desc->GetStreamId()); + if (stream_list_.size() == 1) { + stream = stream_list_[0]; + } else if (stream_list_.size() > stream_id) { + stream = stream_list_[stream_id]; + } else { + GELOGE(INTERNAL_ERROR, "InitLabelSet: stream index: %u >= stream size: %zu.", stream_id, stream_list_.size()); + return INTERNAL_ERROR; + } + + rtLabel_t rt_label = nullptr; + rtError_t rt_error = rtLabelCreate(&rt_label); + if (rt_error != RT_ERROR_NONE || rt_label == nullptr) { + GELOGE(INTERNAL_ERROR, "InitLabelSet: %s create label failed, error=0x%x.", op_desc->GetName().c_str(), rt_error); + return INTERNAL_ERROR; + } + + GELOGI("InitLabelSet: label[%u]=%p stream[%u]=%p.", label_index, rt_label, stream_id, stream); + label_id_indication_.insert(label_index); + label_list_[label_index] = rt_label; + return SUCCESS; +} - GELOGI("Data init success, input size %zu, output size %zu", input_size_list_.size(), output_size_list_.size()); +Status DavinciModel::InitVariable(const OpDescPtr &op_desc) { + variable_op_list_.push_back(op_desc); + return SUCCESS; +} + +Status DavinciModel::InitEndGraph(const OpDescPtr &op_desc) { + end_graph_op_ = op_desc; return SUCCESS; } @@ -1070,31 +1097,34 @@ Status DavinciModel::LoadWithQueue() { return SUCCESS; } - if (input_queue_ids_.size() != data_op_list_.size()) { + if (input_queue_ids_.size() != input_data_info_.size()) { GELOGE(PARAM_INVALID, "Input queue ids not match model: input_queue=%zu input_data=%zu", input_queue_ids_.size(), - data_op_list_.size()); + input_data_info_.size()); return PARAM_INVALID; } - if (output_queue_ids_.size() != output_size_list_.size()) { + if (output_queue_ids_.size() != output_data_info_.size()) { GELOGE(PARAM_INVALID, "Output queue ids not match model: output_queue=%zu output_data=%zu", - output_queue_ids_.size(), output_size_list_.size()); + output_queue_ids_.size(), output_data_info_.size()); return PARAM_INVALID; } // create stream instance which rt_model_handel is running on, this is S0. 
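InitLabelSet above pairs with the `label_list_.resize(LabelNum(), nullptr)` change in Init: labels are no longer bulk-created up front; each LabelSet op claims and fills exactly one pre-sized slot, with `label_id_indication_` guarding against out-of-range and duplicate indexes. A rough sketch of that slot-claiming pattern, assuming the runtime label handle can be modeled as an opaque pointer (`LabelTable` and `Claim` are hypothetical names, not GE APIs):

```cpp
#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

using rtLabel_t = void *;  // stand-in for the runtime's label handle type

struct LabelTable {
  std::vector<rtLabel_t> labels;  // pre-sized; nullptr means "not created yet"
  std::set<uint32_t> used;        // mirrors label_id_indication_

  explicit LabelTable(size_t num) : labels(num, nullptr) {}

  bool Claim(uint32_t index, rtLabel_t label) {
    if (index >= labels.size()) {
      return false;  // index out of range, like the LabelNum() check
    }
    if (!used.insert(index).second) {
      return false;  // index already claimed by another LabelSet op
    }
    labels[index] = label;
    return true;
  }
};

int main() {
  LabelTable table(4);
  int dummy = 0;
  std::printf("first claim: %d\n", table.Claim(1, &dummy));   // prints 1
  std::printf("second claim: %d\n", table.Claim(1, &dummy));  // prints 0
}
```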
GE_CHK_RT_RET(rtStreamCreateWithFlags(&rt_model_stream_, priority_, RT_STREAM_AICPU)); is_inner_model_stream_ = true; - GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, rt_model_stream_, 0)); + GE_CHK_RT_RET(rtModelBindStream(rt_model_handle_, rt_model_stream_, RT_HEAD_STREAM)); // Binding input_queue and Data Op. GE_CHK_STATUS_RET(BindInputQueue(), "Launch bind input queue failed."); - - GE_CHK_STATUS_RET(BindActiveStream(), "Launch active entry stream failed."); - GE_CHK_STATUS_RET(CpuWaitEndGraph(), "Launch wait end graph failed."); + GE_CHK_STATUS_RET(CpuTaskModelZeroCopy(input_mbuf_list_, input_outside_addrs_), "Launch zero copy failed."); // Binding output_queue and NetOutput Op. GE_CHK_STATUS_RET(BindOutputQueue(), "Launch bind output queue failed."); + GE_CHK_STATUS_RET(CpuTaskModelZeroCopy(output_mbuf_list_, output_outside_addrs_), "Launch zero copy failed."); + + GE_CHK_STATUS_RET(BindActiveStream(), "Launch active entry stream failed."); + GE_CHK_STATUS_RET(CpuWaitEndGraph(), "Launch wait end graph failed."); + GE_CHK_STATUS_RET(BindEnqueue(), "Launch enqueue failed.") GE_CHK_STATUS_RET(CpuModelRepeat(), "Launch model repeat failed."); return SUCCESS; @@ -1106,9 +1136,15 @@ Status DavinciModel::LoadWithQueue() { Status DavinciModel::BindInputQueue() { // Caller checked: input_queue_ids_.size() == input_size_list_.size() != input_addr_list_.size() for (size_t i = 0; i < input_queue_ids_.size(); ++i) { + auto it = input_data_info_.find(i); + if (it == input_data_info_.end()) { + GELOGE(FAILED, "Input not match: tensor num=%zu, Queue id index=%zu", input_data_info_.size(), i); + return FAILED; + } + uint32_t queue_id = input_queue_ids_[i]; - uint32_t data_size = input_size_list_[i]; - uintptr_t data_addr = reinterpret_cast(input_addr_list_[i]); + uint32_t data_size = static_cast(it->second.first); + uintptr_t data_addr = reinterpret_cast(it->second.second); GELOGI("BindInputToQueue: graph_%u index[%zu] queue id[%u] output addr[0x%lx] output size[%u]", runtime_param_.graph_id, i, queue_id, data_addr, data_size); @@ -1116,7 +1152,7 @@ Status DavinciModel::BindInputQueue() { return INTERNAL_ERROR; } - if (CpuModelDequeue(queue_id, data_addr, data_size) != SUCCESS) { + if (CpuModelDequeue(queue_id) != SUCCESS) { return INTERNAL_ERROR; } } @@ -1125,57 +1161,12 @@ Status DavinciModel::BindInputQueue() { } /// @ingroup ge -/// @brief queue schedule, bind output queue to NetOutput input address. -/// @return: 0 for success / others for failed -Status DavinciModel::BindOutputQueue() { - // Caller checked: input_queue_ids_.size() == input_size_list_.size() != input_addr_list_.size() - for (size_t i = 0; i < output_queue_ids_.size(); ++i) { - uint32_t queue_id = output_queue_ids_[i]; - uint32_t data_size = output_size_list_[i]; - uintptr_t data_addr = reinterpret_cast(output_addr_list_[i]); - GELOGI("BindOutputToQueue: graph_%u index[%zu] queue id[%u] input addr[0x%lx] input size[%u]", - runtime_param_.graph_id, i, queue_id, data_addr, data_size); - - if (rtModelBindQueue(rt_model_handle_, queue_id, RT_MODEL_OUTPUT_QUEUE) != RT_ERROR_NONE) { - return INTERNAL_ERROR; - } - - if (CpuModelEnqueue(queue_id, data_addr, data_size) != SUCCESS) { - return INTERNAL_ERROR; - } - } - - return SUCCESS; -} - -/// @ingroup ge -/// @brief queue schedule, active stream will schedule by S0. -/// @return: 0 for success / others for failed -Status DavinciModel::BindActiveStream() { - // Stream not in active_stream_indication_ is active stream. 
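LoadWithQueue now schedules a `CpuTaskZeroCopy` right after each bind step, and CpuTaskZeroCopy::Init (earlier in this patch) flattens `outside_addrs`, a map from each model tensor address to every task-argument slot that references it, together with the per-tensor mbuf list into two parallel address arrays the AICPU kernel walks pairwise. A host-side sketch of just that flattening step, with the rtMalloc/rtMemcpy staging omitted and illustrative tensor/slot variables:

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main() {
  // One mbuf per model tensor, consumed in the map's iteration order.
  std::vector<uint64_t> mbuf_list = {0x1000, 0x2000};

  // Model tensor addr -> every task-arg slot that references that tensor.
  int t0 = 0, t1 = 0;
  int slot_a = 0, slot_b = 0, slot_c = 0;
  std::map<const void *, std::vector<void *>> outside_addrs = {
      {&t0, {&slot_a, &slot_b}},  // first tensor referenced from two slots
      {&t1, {&slot_c}},
  };

  // Flatten into parallel src/dst arrays: one pair per reference.
  std::vector<uint64_t> src_addrs;
  std::vector<uint64_t> dst_addrs;
  size_t index = 0;
  for (const auto &addrs : outside_addrs) {
    for (void *slot : addrs.second) {
      src_addrs.push_back(mbuf_list.at(index));  // mbuf feeding this tensor
      dst_addrs.push_back(reinterpret_cast<uint64_t>(slot));
    }
    ++index;
  }
  std::cout << "address pairs: " << src_addrs.size() << '\n';  // prints 3
}
```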
- std::vector active_stream_list; - for (size_t i = 0; i < stream_list_.size(); ++i) { - if (active_stream_indication_.count(i) == 0) { - active_stream_list.push_back(stream_list_[i]); - active_stream_indication_.insert(i); // deactive all model stream. - } - } - - // Active stream add to active entry, will active by S0. - if (CpuActiveStream(active_stream_list) != SUCCESS) { - return INTERNAL_ERROR; - } - - return SUCCESS; -} - -/// @ingroup ge /// @brief definiteness queue schedule, bind input queue to task. /// @param [in] queue_id: input queue id from user. /// @param [in] addr: Data Op output tensor address. /// @param [in] size: Data Op output tensor size. /// @return: 0 for success / others for failed -Status DavinciModel::CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t size) { +Status DavinciModel::CpuModelDequeue(uint32_t queue_id) { GELOGI("Set CpuKernel model dequeue task enter."); std::shared_ptr dequeue_task = MakeShared(rt_model_stream_); if (dequeue_task == nullptr) { @@ -1189,20 +1180,55 @@ Status DavinciModel::CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t return FAILED; } - std::shared_ptr prepare_input = MakeShared(rt_model_stream_); - if (dequeue_task == nullptr) { - GELOGE(FAILED, "Make CpuTaskPrepareInput task failed."); + cpu_task_list_.push_back(dequeue_task); + input_mbuf_list_.push_back(in_mbuf); + GELOGI("Set CpuKernel model dequeue task success."); + return SUCCESS; +} + +Status DavinciModel::CpuTaskModelZeroCopy(std::vector &mbuf_list, + std::map> &outside_addrs) { + GELOGI("Set CpuKernel model zero_copy task enter."); + std::shared_ptr zero_copy = MakeShared(rt_model_stream_); + if (zero_copy == nullptr) { + GELOGE(FAILED, "Make CpuTaskZeroCopy task failed."); return FAILED; } - if (prepare_input->Init(addr, size, in_mbuf) != SUCCESS) { + if (zero_copy->Init(mbuf_list, outside_addrs) != SUCCESS) { return FAILED; } + cpu_task_list_.push_back(zero_copy); + GELOGI("Set CpuKernel model zero_copy task success."); + return SUCCESS; +} + +/// @ingroup ge +/// @brief queue schedule, bind output queue to NetOutput input address. +/// @return: 0 for success / others for failed +Status DavinciModel::BindOutputQueue() { + // Caller checked: input_queue_ids_.size() == input_size_list_.size() != input_addr_list_.size() + for (size_t i = 0; i < output_queue_ids_.size(); ++i) { + auto it = output_data_info_.find(i); + if (it == output_data_info_.end()) { + GELOGE(FAILED, "Output not match: tensor num=%zu, Queue id index=%zu", output_data_info_.size(), i); + return FAILED; + } + + uint32_t queue_id = output_queue_ids_[i]; + uint32_t data_size = static_cast(it->second.first); + uintptr_t data_addr = reinterpret_cast(it->second.second); + GELOGI("BindOutputToQueue: graph_%u index[%zu] queue id[%u] input addr[0x%lx] input size[%u]", + runtime_param_.graph_id, i, queue_id, data_addr, data_size); + + if (rtModelBindQueue(rt_model_handle_, queue_id, RT_MODEL_OUTPUT_QUEUE) != RT_ERROR_NONE) { + return INTERNAL_ERROR; + } + if (CpuModelPrepareOutput(data_addr, data_size) != SUCCESS) { + return INTERNAL_ERROR; + } + } - cpu_task_list_.push_back(dequeue_task); - cpu_task_list_.push_back(prepare_input); - input_mbuf_list_.push_back(in_mbuf); - GELOGI("Set CpuKernel model dequeue task success."); return SUCCESS; } @@ -1212,7 +1238,7 @@ Status DavinciModel::CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t /// @param [in] addr: NetOutput Op input tensor address. /// @param [in] size: NetOutput Op input tensor size. 
/// @return: 0 for success / others for failed -Status DavinciModel::CpuModelEnqueue(uint32_t queue_id, uintptr_t addr, uint32_t size) { +Status DavinciModel::CpuModelPrepareOutput(uintptr_t addr, uint32_t size) { GELOGI("Set CpuKernel model enqueue task enter."); if (input_mbuf_list_.empty()) { GELOGE(FAILED, "Need input mbuf for fill output mbuf head info."); return FAILED; } @@ -1230,20 +1256,30 @@ Status DavinciModel::CpuModelEnqueue(uint32_t queue_id, uintptr_t addr, uint32_t return FAILED; } - std::shared_ptr model_enqueue = MakeShared(rt_model_stream_); - if (model_enqueue == nullptr) { - GELOGE(FAILED, "Make CpuTaskModelEnqueue task failed."); - return FAILED; + cpu_task_list_.push_back(prepare_output); + output_mbuf_list_.push_back(out_mbuf); + GELOGI("Set CpuKernel model enqueue task success."); + return SUCCESS; +} + +/// @ingroup ge +/// @brief queue schedule, active stream will schedule by S0. +/// @return: 0 for success / others for failed +Status DavinciModel::BindActiveStream() { + // Stream not in active_stream_indication_ is active stream. + std::vector active_stream_list; + for (size_t i = 0; i < stream_list_.size(); ++i) { + if (active_stream_indication_.count(i) == 0) { + active_stream_list.push_back(stream_list_[i]); + active_stream_indication_.insert(i); // deactivate all model streams. + } } - if (model_enqueue->Init(queue_id, out_mbuf) != SUCCESS) { - return FAILED; + // Active stream add to active entry, will be activated by S0. + if (CpuActiveStream(active_stream_list) != SUCCESS) { + return INTERNAL_ERROR; } - cpu_task_list_.push_back(prepare_output); - cpu_task_list_.push_back(model_enqueue); - output_mbuf_list_.push_back(out_mbuf); - GELOGI("Set CpuKernel model enqueue task success."); return SUCCESS; } @@ -1293,6 +1329,38 @@ Status DavinciModel::CpuWaitEndGraph() { return SUCCESS; } +Status DavinciModel::BindEnqueue() { + for (size_t i = 0; i < output_queue_ids_.size(); ++i) { + auto it = output_data_info_.find(i); + if (it == output_data_info_.end()) { + GELOGE(FAILED, "Output not match: tensor num=%zu, Queue id index=%zu", output_data_info_.size(), i); + return FAILED; + } + + uint32_t queue_id = output_queue_ids_[i]; + if (CpuModelEnqueue(queue_id, output_mbuf_list_[i]) != SUCCESS) { + return INTERNAL_ERROR; + } + } + return SUCCESS; +} + +Status DavinciModel::CpuModelEnqueue(uint32_t queue_id, uintptr_t out_mbuf) { + GELOGI("Set CpuKernel model enqueue task enter."); + std::shared_ptr model_enqueue = MakeShared(rt_model_stream_); + if (model_enqueue == nullptr) { + GELOGE(FAILED, "Make CpuTaskModelEnqueue task failed."); + return FAILED; + } + + if (model_enqueue->Init(queue_id, out_mbuf) != SUCCESS) { + return FAILED; + } + cpu_task_list_.push_back(model_enqueue); + GELOGI("Set CpuKernel model enqueue task success."); + return SUCCESS; +} + /// @ingroup ge /// @brief definiteness queue schedule, repeat run model. /// @return: 0 for success / others for failed @@ -1589,17 +1657,35 @@ ge::Format DavinciModel::GetFormat() { } Status DavinciModel::CopyInputData(const InputData &current_data, bool device_data) { - Status ret = SUCCESS; - uint32_t data_op_index = 0; + rtMemcpyKind_t kind = device_data ?
RT_MEMCPY_DEVICE_TO_DEVICE : RT_MEMCPY_HOST_TO_DEVICE; + const std::vector &blobs = current_data.blobs; + for (const auto &data : input_data_info_) { + if (data.first >= blobs.size()) { + GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u, size=%ld", blobs.size(), + input_data_info_.size(), data.first, data.second.first); + return FAILED; + } - for (auto op_desc : data_op_list_) { - ret = CopyInputDataToModel(current_data.blobs, data_op_index, device_data); + const DataBuffer &data_buf = blobs[data.first]; + // if data attr support zero copy, then update addrs info to flowtable + bool flag = data_buf.isDataSupportMemShare && support_mem_shared_flag_; + if (flag) { + GELOGI("No need to copy input data, user's input data buffer can be shared."); + continue; + } + + void *mem_addr = data.second.second; + uint32_t mem_size = static_cast(data.second.first); + GE_CHK_BOOL_RET_STATUS(mem_size >= data_buf.length, PARAM_INVALID, + "input data size(%u) does not match model required size(%u), ret failed.", data_buf.length, + mem_size); - GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "Copy input data to model ret failed, index:%u, model id:%u", - current_data.index, current_data.model_id); - data_op_index++; + GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] output[%u] memaddr[%p] mem_size[%u] datasize[%u]", + runtime_param_.graph_id, data.first, mem_addr, mem_size, data_buf.length); + GE_CHK_RT_RET(rtMemcpy(mem_addr, mem_size, data_buf.data, data_buf.length, kind)); } - return ret; + + return SUCCESS; } Status DavinciModel::SyncVarData() { @@ -1917,134 +2003,6 @@ void DavinciModel::SetProfileTime(ModelProcStage stage, int64_t endTime) { } return; } -/// -/// @ingroup domi_ome -/// @brief copy input data to Model's firat OP. Address already malloced when Load -/// @copy need datatype transfer: FLOAT to FP16, 4D to 5D; -/// @param [in] data data pointer to be copy -/// @return Status result -/// @author -/// -Status DavinciModel::CopyInputDataToModel(const std::vector &data, uint32_t data_op_index, - bool device_data) { - GE_CHK_BOOL_RET_STATUS(!data_op_list_.empty(), PARAM_INVALID, "data_op_list_ is empty!"); - - GE_CHK_BOOL_RET_STATUS(data_op_list_.size() == data.size(), PARAM_INVALID, - "The input data list size (%zu) does not match the model input list size (%zu)", data.size(), - data_op_list_.size()); - - GE_CHK_BOOL_RET_STATUS(data_op_index < data_op_list_.size(), PARAM_INVALID, - "input data op index(%zu) is invalid, exceeds input op size(%zu)", data_op_index, - data_op_list_.size()); - - /// input datatype conversion, converting FLOAT to FP16, 4D to 5D at the same time. - /// Choose respective mode in API parameters. 
- auto op_def = data_op_list_[data_op_index]; - GE_CHK_BOOL_EXEC(op_def != nullptr, return PARAM_INVALID, "op_def is null!"); - - auto data_index = data_op_index; - if (AttrUtils::GetInt(op_def, "index", data_index)) { - GELOGI("ge_train:get new index %u , old %u", data_index, data_op_index); - } - - GE_CHK_BOOL_EXEC(data_index < data.size(), return PARAM_INVALID, "index:%u >= size:%zu", data_index, data.size()); - GE_CHK_BOOL_RET_STATUS(op_def->GetInputsSize() == 1 && op_def->GetOutputsSize() == 1, PARAM_INVALID, - "Data Op has invalid input_desc_size(%zu) or output_desc_size(%zu)", op_def->GetInputsSize(), - op_def->GetOutputsSize()); - - // float to float16 - bool need_trans_flag = ModelUtils::IsInputTensorNeedTrans(data_op_list_[data_op_index], 0); - - int64_t output_size = 0; - GE_CHK_STATUS(TensorUtils::GetSize(*op_def->GetOutputDescPtr(0), output_size), "get output size failed."); - GE_CHK_BOOL_RET_STATUS(output_size >= data[data_index].length, PARAM_INVALID, - "input data size(%u) does not match model required size(%zu), ret failed.", - data[data_index].length, output_size); - - vector outputs = op_def->GetOutputOffset(); - if (device_data) { - return CopyPlainData(data, data_index, data_op_index, outputs, RT_MEMCPY_DEVICE_TO_DEVICE); - } else if (need_trans_flag) { - return CopyTransData(data, data_index, data_op_index, outputs); - } else { - return CopyPlainData(data, data_index, data_op_index, outputs, RT_MEMCPY_HOST_TO_DEVICE); - } -} - -Status DavinciModel::CopyTransData(const std::vector &data, uint32_t data_index, uint32_t data_op_index, - const std::vector &outputs) { - GE_CHECK_VECTOR_NOT_EMPTY(outputs); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(outputs[0] == -1, return PARAM_INVALID, "output offset is -1"); - GE_CHK_BOOL_EXEC(data_index < data.size(), return PARAM_INVALID, "index:%u >= size:%zu", data_index, data.size()); - - auto input_tensor_desc = data_op_input_tensor_desc_map_[data_op_list_[data_op_index]->GetName()]; - auto output_tensor_desc = data_op_output_tensor_desc_map_[data_op_list_[data_op_index]->GetName()]; - - uint8_t *src_data = reinterpret_cast(data[data_index].data); - - formats::TransResult tmp_result{}; - auto input_shape = input_tensor_desc->GetShape(); - auto src_data_size = input_shape.GetShapeSize(); - auto src_data_type = input_tensor_desc->GetDataType(); - auto dst_data_type = output_tensor_desc->GetDataType(); - GELOGD("Trans data type from %s to %s, input shape %s, data size %zu", - TypeUtils::DataTypeToSerialString(src_data_type).c_str(), - TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), - src_data_size); - auto ret = - formats::TransDataType({src_data, static_cast(src_data_size), src_data_type, dst_data_type}, tmp_result); - if (ret != SUCCESS) { - GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %zu, error code %u", - TypeUtils::DataTypeToSerialString(src_data_type).c_str(), - TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(), - src_data_size, ret); - return ret; - } - - void *mem_addr = mem_base_ + outputs[0]; - auto rt_ret = rtMemcpy(mem_addr, static_cast(runtime_param_.mem_size - outputs[0]), - reinterpret_cast(tmp_result.data.get()), static_cast(tmp_result.length), - RT_MEMCPY_HOST_TO_DEVICE); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", tmp_result.length); - return RT_FAILED; - } - GELOGI("[IMAS]CopyTransData memcpy graph_%u type[F] name[%s] 
output[%d] memaddr[%p] datasize[%zu]", - runtime_param_.graph_id, data_op_list_[data_op_index]->GetName().c_str(), 0, mem_addr, tmp_result.length); - return SUCCESS; -} - -Status DavinciModel::CopyPlainData(const std::vector &data, uint32_t data_index, uint32_t data_op_index, - const std::vector &outputs, rtMemcpyKind_t kind) { - GE_CHK_BOOL_EXEC(data_index < data.size(), return PARAM_INVALID, "index:%u >= size:%zu", data_index, data.size()); - bool flag = data[data_index].isDataSupportMemShare && support_mem_shared_flag_; - // if data attr support zero cpy,then update addrs info to flowtable - if (flag) { - GELOGI("No need to copy input data, user's input data buffer can be shared."); - return SUCCESS; - } - - GE_CHECK_VECTOR_NOT_EMPTY(outputs); - // P2P memory space parameters - void *host_data_addr = data[data_index].data; - uint32_t copy_size = data[data_index].length; - GELOGD("data output tensor is aipp tensor,copy data only."); - - void *data_out_addr = nullptr; - if (VarManager::Instance(session_id_)->IsVarAddr(outputs[0])) { - data_out_addr = var_mem_base_ + outputs[0] - runtime_param_.logic_var_base; - } else { - data_out_addr = mem_base_ + outputs[0]; - GELOGI("output[0]=%ld, copy_size=%u, total_size=%zu", outputs[0], copy_size, TotalMemSize()); - - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(((uint64_t)outputs[0] + (uint64_t)copy_size) > TotalMemSize(), return INTERNAL_ERROR, - "input offset add size is large than total memory."); - } - - GE_CHK_RT_RET(rtMemcpy(data_out_addr, copy_size, host_data_addr, copy_size, kind)); - - return SUCCESS; -} /// /// @ingroup domi_ome @@ -2061,9 +2019,9 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data) { } else { output_data.index = data_id; output_data.model_id = model_id_; - GE_CHK_BOOL_RET_STATUS(output_data.blobs.size() == output_size_list_.size(), INTERNAL_ERROR, + GE_CHK_BOOL_RET_STATUS(output_data.blobs.size() == output_data_info_.size(), INTERNAL_ERROR, "output buffer size[%zu] not equal output_size_list[%zu] size!", output_data.blobs.size(), - output_size_list_.size()); + output_data_info_.size()); // index of data in output_data uint32_t output_data_index = 0; @@ -2100,6 +2058,9 @@ Status DavinciModel::CopyOutputDataToUser(OpDescPtr &op_desc, std::vectorGetName().c_str(), i, data_buf.data, data_buf.length, v_output_size[i]); GE_CHK_RT_RET(rtMemcpy(data_buf.data, size, v_output_data_addr[i], size, RT_MEMCPY_DEVICE_TO_DEVICE)); } @@ -2417,7 +2378,6 @@ void *DavinciModel::Run(DavinciModel *model) { CsaInteract::GetInstance().WriteInternalErrorCode(); GELOGI("Model run end, model id:%u", model->model_id_); - GEEVENT("Model Run thread end, model_id:%u.", model->model_id_); return nullptr; } @@ -2778,20 +2738,20 @@ bool DavinciModel::CheckInputAndModelSize(const int64_t &input_size, const int64 /// /// @ingroup ge /// @brief Copy Inputs and Outputs addr to model for direct use. -/// @param [in] const domi::InputData &input_data: model input data. -/// @param [in] domi::OutputData &output_data: model output data. +/// @param [in] const InputData &input_data: model input data. +/// @param [in] OutputData &output_data: model output data. 
/// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input /// @return SUCCESS handle successfully / PARAM_INVALID for failed /// Status DavinciModel::CopyModelData(const InputData &input_data, OutputData &output_data, bool is_dynamic_input) { - if (ZeroCopyBlobs(input_addr_list_, input_size_list_, input_data.blobs, is_dynamic_input, kInputZeroCopy, - input_data.batch_label) != SUCCESS) { + if (ZeroCopyBlobs(input_data_info_, input_data.blobs, is_dynamic_input, kInputZeroCopy, input_data.batch_label) != + SUCCESS) { GELOGE(PARAM_INVALID, "Copy input data to model failed."); return PARAM_INVALID; } - if (ZeroCopyBlobs(output_addr_list_, output_size_list_, output_data.blobs, is_dynamic_input, kOutputZeroCopy, - input_data.batch_label) != SUCCESS) { + if (ZeroCopyBlobs(output_data_info_, output_data.blobs, is_dynamic_input, kOutputZeroCopy, input_data.batch_label) != + SUCCESS) { GELOGE(PARAM_INVALID, "Copy output data to model failed."); return PARAM_INVALID; } @@ -2804,31 +2764,37 @@ Status DavinciModel::CopyModelData(const InputData &input_data, OutputData &outp /// /// @ingroup ge /// @brief Copy Data addr to model for direct use. -/// @param [in] const vector &addrs: model input memory addr list. -/// @param [in] const vector &sizes: model input memory size list. +/// @param [in] const std::map> &data_info: model memory addr/size list. /// @param [in] const std::vector &blobs: user input data list. /// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input /// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy /// @param [in] string batch_label: batch label for multi-batch scenes /// @return SUCCESS handle successfully / others handle failed /// -Status DavinciModel::ZeroCopyBlobs(const std::vector &addr_list, const std::vector &size_list, +Status DavinciModel::ZeroCopyBlobs(const std::map> &data_info, const std::vector &blobs, bool is_dynamic_input, ZeroCopyMode zero_copy_mode, std::string batch_label) { - if ((blobs.size() != addr_list.size()) || (blobs.size() != size_list.size())) { - GELOGE(FAILED, "Blobs not match: blobs=%zu addr=%zu size=%zu", blobs.size(), addr_list.size(), size_list.size()); + if (blobs.size() != data_info.size()) { + GELOGE(FAILED, "Blobs not match: blobs=%zu, data_info=%zu", blobs.size(), data_info.size()); return FAILED; } - for (size_t idx = 0; idx < size_list.size(); ++idx) { - const DataBuffer &data_buf = blobs[idx]; + for (const auto &data : data_info) { + if (data.first >= blobs.size()) { + GELOGE(FAILED, "Blobs not match: blobs=%zu, tensor=%zu, index=%u", blobs.size(), data_info.size(), data.first); + return FAILED; + } + int64_t mem_size = data.second.first; + void *mem_addr = data.second.second; + + const DataBuffer &data_buf = blobs[data.first]; if (data_buf.data == nullptr) { - GELOGE(FAILED, "data_buf.data is nullptr, index=%zu", idx); + GELOGE(FAILED, "data_buf.data is nullptr, index=%u", data.first); return FAILED; } - GELOGI("Copy Blobs %zu: Input data length is %u, Op data size is %u.", idx, data_buf.length, size_list[idx]); - if (!CheckInputAndModelSize(data_buf.length, size_list[idx], is_dynamic_input)) { + GELOGI("Copy Blobs %u: Input data length is %u, Op data size is %ld.", data.first, data_buf.length, mem_size); + if (!CheckInputAndModelSize(data_buf.length, mem_size, is_dynamic_input)) { GELOGE(FAILED, "Check input size and model size failed"); return FAILED; } @@ -2838,14 +2804,14 @@ Status
DavinciModel::ZeroCopyBlobs(const std::vector &addr_list, const s } if (zero_copy_mode == kInputZeroCopy) { - if (ZeroCopyInputBlobs(addr_list[idx], size_list[idx], data_buf, zero_copy_mode, batch_label) != SUCCESS) { + if (ZeroCopyInputBlobs(mem_addr, mem_size, data_buf, zero_copy_mode, batch_label) != SUCCESS) { GELOGE(FAILED, "Zero copy input blobs failed"); return FAILED; } } if (zero_copy_mode == kOutputZeroCopy && !is_dynamic_input) { - if (ZeroCopyImpl(addr_list[idx], data_buf, zero_copy_mode, batch_label) != SUCCESS) { + if (ZeroCopyImpl(mem_addr, data_buf, zero_copy_mode, batch_label) != SUCCESS) { GELOGE(FAILED, "Output zero copy data node copy failed"); return FAILED; } @@ -2940,11 +2906,21 @@ Status DavinciModel::ZeroCopyImpl(const void *src_addr, const DataBuffer &data_b if (!CheckDynamicBatchZeroCopyAddr(addr, dynamic_input_addrs, fix_input_addrs)) { continue; } - __builtin_prefetch(addr); - rtError_t rt_err = rtMemcpy(addr, sizeof(void *), &dst_addr, sizeof(void *), RT_MEMCPY_HOST_TO_DEVICE); - if (rt_err != RT_ERROR_NONE) { - GELOGE(FAILED, "ZeroCopyImpl: rtMemcpy failed"); - return FAILED; + + if (is_async_mode_) { + rtError_t rt_err = + rtMemcpyAsync(addr, kAddrLen, &dst_addr, kAddrLen, RT_MEMCPY_HOST_TO_DEVICE_EX, rt_model_stream_); + if (rt_err != RT_ERROR_NONE) { + GELOGE(FAILED, "ZeroCopyImpl: rtMemcpyAsync failed"); + return FAILED; + } + } else { + __builtin_prefetch(addr); + rtError_t rt_err = rtMemcpy(addr, kAddrLen, &dst_addr, kAddrLen, RT_MEMCPY_HOST_TO_DEVICE); + if (rt_err != RT_ERROR_NONE) { + GELOGE(FAILED, "ZeroCopyImpl: rtMemcpy failed"); + return FAILED; + } } GELOGI("[IMAS]refresh in/out addr new:%p, old:%p", dst_addr, src_addr); } @@ -2999,7 +2975,7 @@ const char *DavinciModel::GetRegisterStub(const string &binfile, const string &s /// @brief Constant Op Init. /// @return Status /// -Status DavinciModel::InitConstant(const ConstOpDescPtr &op_desc) const { +Status DavinciModel::InitConstant(const OpDescPtr &op_desc) { auto v_weights = ModelUtils::GetWeights(op_desc); auto v_output_size = ModelUtils::GetOutputSize(op_desc); auto v_output_addr = ModelUtils::GetOutputDataAddrs(runtime_param_, op_desc); @@ -3023,17 +2999,24 @@ Status DavinciModel::InitConstant(const ConstOpDescPtr &op_desc) const { /// the logic of GetShapeSize is wrong, the scaler tensor's GetShapeSize is zero /// and that of unknown shape is zero too. /// unknown shape will not appear here, so we can use zero judge a tensor is scaler or not - int64_t elem_num = tensor_shape.GetShapeSize() == 0 ? 
1 : tensor_shape.GetShapeSize(); + int64_t elem_num = tensor_shape.GetShapeSize(); + if (elem_num == 0 && tensor_shape.GetDims().size() == 0) { + elem_num = 1; + } uint64_t *buff = reinterpret_cast(tensor->MutableData().data()); GE_CHK_BOOL_RET_STATUS(ge::CheckInt64Uint32MulOverflow(elem_num, kBytes) == SUCCESS, FAILED, "Shape size is invalid"); - int64_t offset = elem_num * kBytes; + uint64_t offset = static_cast(elem_num * kBytes); - uint64_t hbm_raw_data_base_addr = reinterpret_cast(v_output_addr[0]) + offset; + uint64_t hbm_raw_data_base_addr = + reinterpret_cast(reinterpret_cast(v_output_addr[0])) + offset; for (int64_t i = elem_num - 1; i >= 0; --i) { buff[i] = hbm_raw_data_base_addr + (buff[i] - buff[0]); } } + GELOGI("[IMAS]InitConstant memcpy graph_%u type[V] name[%s] output[%d] memaddr[%p] mem_size[%u] datasize[%zu]", + runtime_param_.graph_id, op_desc->GetName().c_str(), 0, v_output_addr[0], v_output_size[0], + tensor->GetData().size()); GE_CHK_RT_RET(rtMemcpy(v_output_addr[0], v_output_size[0], tensor->GetData().data(), tensor->GetData().size(), RT_MEMCPY_HOST_TO_DEVICE)); @@ -3143,45 +3126,48 @@ void DavinciModel::CleanTbeHandle() { /// @brief insert active_stream_indication_ /// @return Status /// -Status DavinciModel::MarkActiveStream(const OpDescPtr &op_desc) { - GE_CHECK_NOTNULL(op_desc); - std::string type = op_desc->GetType(); - GE_IF_BOOL_EXEC( - type == STREAMSWITCH, std::vector active_stream_list; - GE_LOGI_IF(!ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), - "GetInt ACTIVE_STREAM_LIST failed."); - if (active_stream_list.size() != kTrueBranchStreamNum) { - GELOGE(INTERNAL_ERROR, "Stream num of switch true branch must be %u.", kTrueBranchStreamNum); - return INTERNAL_ERROR; - } uint32_t true_stream_id = active_stream_list.front(); - active_stream_indication_.insert(true_stream_id); - GELOGI("flowctrl_op_index_map node:%s, true_stream_id=%u.", op_desc->GetName().c_str(), true_stream_id);); - GE_IF_BOOL_EXEC( - type == STREAMACTIVE, if (op_desc->HasAttr(ATTR_NAME_SWITCH_BRANCH_NODE_LABEL)) { - std::vector active_stream_list; - GE_CHK_BOOL_EXEC(AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), - return INTERNAL_ERROR, "StreamActiveOp get attr ACTIVE_STREAM failed."); - - for (size_t j = 0; j < active_stream_list.size(); ++j) { - active_stream_indication_.insert(active_stream_list[j]); - GELOGI("flowctrl_op_index_map node:%s, active_stream_id=%u.", op_desc->GetName().c_str(), - active_stream_list[j]); - } - }); - - if (type == STREAMSWITCHN) { +Status DavinciModel::InitStreamActive(const OpDescPtr &op_desc) { + if (op_desc->HasAttr(ATTR_NAME_SWITCH_BRANCH_NODE_LABEL)) { std::vector active_stream_list; - if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list)) { - GELOGE(INTERNAL_ERROR, "StreamSwitchNOp get attr ACTIVE_STREAM failed."); - return INTERNAL_ERROR; - } + GE_CHK_BOOL_EXEC(AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), + return INTERNAL_ERROR, "StreamActiveOp get attr ACTIVE_STREAM failed."); for (size_t j = 0; j < active_stream_list.size(); ++j) { active_stream_indication_.insert(active_stream_list[j]); - GELOGI("StreamSwitchNOp node:%s, active_stream_id=%u.", op_desc->GetName().c_str(), active_stream_list[j]); - }; + GELOGI("flowctrl_op_index_map node:%s, active_stream_id=%u.", op_desc->GetName().c_str(), active_stream_list[j]); + } + } + + return SUCCESS; +} + +Status DavinciModel::InitStreamSwitch(const OpDescPtr &op_desc) { 
+ std::vector active_stream_list; + GE_LOGI_IF(!ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list), + "GetInt ACTIVE_STREAM_LIST failed."); + if (active_stream_list.size() != kTrueBranchStreamNum) { + GELOGE(INTERNAL_ERROR, "Stream num of switch true branch must be %u.", kTrueBranchStreamNum); + return INTERNAL_ERROR; + } + + uint32_t true_stream_id = active_stream_list.front(); + active_stream_indication_.insert(true_stream_id); + GELOGI("flowctrl_op_index_map node:%s, true_stream_id=%u.", op_desc->GetName().c_str(), true_stream_id); + + return SUCCESS; +} + +Status DavinciModel::InitStreamSwitchN(const OpDescPtr &op_desc) { + std::vector active_stream_list; + if (!AttrUtils::GetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_stream_list)) { + GELOGE(INTERNAL_ERROR, "StreamSwitchNOp get attr ACTIVE_STREAM failed."); + return INTERNAL_ERROR; + } + + for (size_t j = 0; j < active_stream_list.size(); ++j) { + active_stream_indication_.insert(active_stream_list[j]); + GELOGI("StreamSwitchNOp node:%s, active_stream_id=%u.", op_desc->GetName().c_str(), active_stream_list[j]); } - GELOGI("Flow control: active_stream_indication_ size = %zu.", active_stream_indication_.size()); return SUCCESS; } @@ -3205,12 +3191,11 @@ bool DavinciModel::IsBroadCastOpData(const ge::NodePtr &var_node) { /// @ingroup domi_ome /// @brief Init model stream for NN model. /// @param [in] stream user input model stream. -/// @param [in] async_mode is asynchronize mode. /// @return Status /// -Status DavinciModel::InitModelStream(rtStream_t stream, bool async_mode) { +Status DavinciModel::InitModelStream(rtStream_t stream) { // asynchronize mode, use user input stream. - if (async_mode) { + if (is_async_mode_) { rt_model_stream_ = stream; is_inner_model_stream_ = false; return SUCCESS; @@ -3245,16 +3230,12 @@ Status DavinciModel::InitModelStream(rtStream_t stream, bool async_mode) { /// Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputData &input_data, OutputData &output_data) { - GELOGI("Model Run begin, model id:%u, data index:%u, flag:%d.", model_id_, input_data.index, async_mode); - GE_CHK_STATUS(InitModelStream(stream, async_mode), "Init model stream failed."); - - GELOGI("do rtModelExecute task sink, model id:%u", input_data.model_id); + is_async_mode_ = async_mode; + GELOGI("Model Run begin, model id:%u, data index:%u, flag:%d.", model_id_, input_data.index, is_async_mode_); + GE_CHK_STATUS_RET(InitModelStream(stream), "Init model stream failed."); - auto enable_dump = false; auto dump_path = PropertiesManager::Instance().GetDumpOutputPath(); - if (!dump_path.empty()) { - enable_dump = true; - } + auto enable_dump = !dump_path.empty(); auto dump_op_env = std::getenv("DUMP_OP"); if (dump_op_env != nullptr) { @@ -3275,9 +3256,9 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa output_use_zero_copy_ = false; } - // Asynchronous mode depends on zero copy. - if (async_mode && !input_use_zero_copy_ && !output_use_zero_copy_ && !task_list_.empty()) { - GELOGE(FAILED, "Asynchronous mode but zero copy disabled."); + // Empty task, Just copy input to output, need direct copy. 
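
A note on the DT_STRING constant path in the hunk above: each element of the tensor arrives holding a host offset, and the loop rewrites every element as the device raw-data base plus that element's distance from element 0; walking the array in reverse keeps buff[0] intact until it is rewritten last, since it anchors the subtraction. A self-contained sketch of the same rebasing (function and variable names are illustrative, not from the GE sources):

    #include <cstdint>
    #include <cstdio>

    // Rebase host string offsets onto a device base address. The raw character
    // data sits right after the elem_num pointer slots, so the device base is
    // v_output_addr[0] + elem_num * sizeof(uint64_t) in the hunk above.
    void RebaseStringAddrs(uint64_t *buff, int64_t elem_num, uint64_t hbm_raw_data_base_addr) {
      // Walk backwards so buff[0], the anchor of the subtraction, changes last.
      for (int64_t i = elem_num - 1; i >= 0; --i) {
        buff[i] = hbm_raw_data_base_addr + (buff[i] - buff[0]);
      }
    }

    int main() {
      uint64_t buff[3] = {100, 116, 140};  // host offsets; buff[0] is string 0
      RebaseStringAddrs(buff, 3, 0x20000);
      std::printf("%llx %llx %llx\n", (unsigned long long)buff[0],
                  (unsigned long long)buff[1], (unsigned long long)buff[2]);
      // prints 20000 20010 20028
      return 0;
    }
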
+ if (task_list_.empty() && (input_use_zero_copy_ || output_use_zero_copy_)) { + GELOGE(FAILED, "Empty task, Just copy input to output, need direct copy."); return FAILED; } @@ -3298,15 +3279,16 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa GELOGI("rtModelExecute end"); } - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_START)); - ret = output_use_zero_copy_ ? SyncDataAndDump() : CopyOutputData(input_data.index, output_data); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return INTERNAL_ERROR, "Copy Output data to user failed."); - GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_END)); + if (!is_async_mode_) { + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_START)); + ret = output_use_zero_copy_ ? SyncDataAndDump() : CopyOutputData(input_data.index, output_data); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return INTERNAL_ERROR, "Copy Output data to user failed."); + GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), SetProfileTime(MODEL_AFTER_PROC_END)); + } // report model time data GE_IF_BOOL_EXEC(ProfilingManager::Instance().ProfilingOn(), (void)SinkTimeProfile(input_data)); GELOGI("Model run end, model id:%u", model_id_); - GEEVENT("Model Run thread end, model_id:%u", model_id_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/davinci_model.h b/src/ge/graph/load/new_model_manager/davinci_model.h index 9ce02a42..76edd4a4 100644 --- a/src/ge/graph/load/new_model_manager/davinci_model.h +++ b/src/ge/graph/load/new_model_manager/davinci_model.h @@ -340,13 +340,6 @@ class DavinciModel { vector &output_desc, std::vector &inputFormats, std::vector &output_formats); - /// - /// @ingroup domi_ome - /// @brief copy input data to model - /// @return Status - /// - Status CopyInputDataToModel(const std::vector &data, uint32_t data_op_index, bool device_data); - Status ReturnResult(uint32_t data_id, const bool rslt_flg, const bool seq_end_flg, OutputData *output_data); Status ReturnNoOutput(uint32_t data_id); @@ -413,20 +406,6 @@ class DavinciModel { /// uint32_t GetDeviceId() const { return device_id_; } - /// - /// @ingroup domi_ome - /// @brief Set Train Mode - /// @return void - /// - void SetTrainMode(bool mode) { is_train_mode_ = mode; } - - /// - /// @ingroup domi_ome - /// @brief Get Train Mode - /// @return bool true - /// - bool GetTrainMode() { return is_train_mode_; } - GeModelPtr GetGeModel() { return ge_model_; } const RuntimeParam &GetRuntimeParam() { return runtime_param_; } @@ -519,15 +498,14 @@ class DavinciModel { /// /// @ingroup ge /// @brief Copy Data addr to model for direct use. - /// @param [in] const vector &addrs: model input memory addr list. - /// @param [in] const vector &sizes: model input memory size list. + /// @param [in] const std::map> &data_info: model memory addr/size list. /// @param [in] const std::vector &blobs: user input data list. 
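
Backing up to the guard at the start of this hunk: the old check rejected asynchronous execution without zero copy, while the new one rejects the opposite corner, a model whose task list is empty but which still has zero-copy inputs or outputs, since such a model can only satisfy a request through direct host copies. A toy version of the inverted predicate (names hypothetical):

    #include <cstdio>
    #include <vector>

    // True when the model must fail fast: no tasks to run, yet zero copy was
    // requested on either side, so a direct copy path is the only option.
    bool EmptyTaskNeedsDirectCopy(const std::vector<int> &task_list,
                                  bool input_zero_copy, bool output_zero_copy) {
      return task_list.empty() && (input_zero_copy || output_zero_copy);
    }

    int main() {
      std::printf("%d\n", EmptyTaskNeedsDirectCopy({}, true, false));   // 1 -> FAILED
      std::printf("%d\n", EmptyTaskNeedsDirectCopy({1}, true, false));  // 0 -> ok
      return 0;
    }
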
/// @param [in] bool is_dynamic_input: whether is dynamic input, true: is dynamic input; false: not is dynamic input /// @param [in] ZeroCopyMode zero_copy_mode: input zero copy or output zero copy /// @param [in] string batch_label: batch label for multi-batch scenes /// @return SUCCESS handle successfully / others handle failed /// - Status ZeroCopyBlobs(const std::vector &addr_list, const std::vector &size_list, + Status ZeroCopyBlobs(const std::map> &data_info, const std::vector &blobs, bool is_dynamic_input, ZeroCopyMode zero_copy_mode, string batch_label); @@ -610,11 +588,9 @@ class DavinciModel { /// @brief Data Op Initialize. /// @param [in] NodePtr: Data Op. /// @param [in/out] data_op_index: NetOutput addr size info. - /// @param [in/out] input_data_info: Data index and addr info {index, {size, addr}}. /// @return Status /// - Status InitDataOp(const NodePtr &node, uint32_t &data_op_index, - std::map> &input_data_info); + Status InitDataOp(const NodePtr &node, uint32_t &data_op_index); /// /// @ingroup ge @@ -633,19 +609,27 @@ class DavinciModel { Status InitNetOutput(const OpDescPtr &op_desc); /// - /// @ingroup ge - /// @brief Make Input and Output addr for feature use. - /// @param [in] input_data_info: Data index and addr info {index, {size, addr}}. - /// @return Status - /// - Status CombineDataInfo(const std::map> &input_data_info); - - /// /// @ingroup domi_ome /// @brief Constant Op Init. /// @return Status /// - Status InitConstant(const ConstOpDescPtr &op_desc) const; + Status InitConstant(const OpDescPtr &op_desc); + + Status InitVariable(const OpDescPtr &op_desc); + + Status InitEndGraph(const OpDescPtr &op_desc); + + /// @ingroup ge + /// @brief LabelSet Op Initialize. + /// @param [in] op_desc: LabelSet Op descriptor. + /// @return Status + Status InitLabelSet(const OpDescPtr &op_desc); + + Status InitStreamSwitch(const OpDescPtr &op_desc); + + Status InitStreamActive(const OpDescPtr &op_desc); + + Status InitStreamSwitchN(const OpDescPtr &op_desc); /// /// @ingroup domi_ome @@ -662,7 +646,7 @@ class DavinciModel { /// @brief Init model stream for NN model. /// @return Status /// - Status InitModelStream(rtStream_t stream, bool async_mode); + Status InitModelStream(rtStream_t stream); /// /// @ingroup ge @@ -678,12 +662,16 @@ class DavinciModel { /// Status BindInputQueue(); + Status CpuTaskModelZeroCopy(std::vector &mbuf_list, + std::map> &outside_addrs); + /// /// @ingroup ge /// @brief ACL, Bind NetOutput Op addr to output queue. /// @return: 0 for success / others for fail /// Status BindOutputQueue(); + Status CpuModelPrepareOutput(uintptr_t addr, uint32_t size); /// /// @ingroup ge @@ -693,13 +681,6 @@ class DavinciModel { Status BindActiveStream(); /// - /// @ingroup domi_ome - /// @brief insert active_stream_indication_ - /// @return Status - /// - Status MarkActiveStream(const OpDescPtr &op_desc); - - /// /// @ingroup ge /// @brief definiteness queue schedule, bind input queue to task. /// @param [in] queue_id: input queue id from user. @@ -707,7 +688,7 @@ class DavinciModel { /// @param [in] size: Data Op output tensor size. /// @return: 0 for success / others for fail /// - Status CpuModelDequeue(uint32_t queue_id, uintptr_t addr, uint32_t size); + Status CpuModelDequeue(uint32_t queue_id); /// /// @ingroup ge @@ -734,6 +715,8 @@ class DavinciModel { /// Status CpuWaitEndGraph(); + Status BindEnqueue(); + Status CpuModelEnqueue(uint32_t queue_id, uintptr_t out_mbuf); /// /// @ingroup ge /// @brief definiteness queue schedule, repeat run model. 
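
The queue-schedule declarations above split binding from data movement: CpuModelDequeue now takes only a queue id, CpuTaskModelZeroCopy patches task args against outside addresses, and BindEnqueue/CpuModelEnqueue handle the output side. As a rough analogue only, with standard containers standing in for the runtime mbuf queues and every name hypothetical, the intended dequeue, zero-copy, enqueue sequence looks like:

    #include <cstdint>
    #include <cstdio>
    #include <queue>
    #include <vector>

    // Toy dequeue -> zero-copy -> enqueue sequence; no runtime APIs involved.
    int main() {
      std::queue<std::uintptr_t> input_queue;   // filled by the caller at runtime
      std::queue<std::uintptr_t> output_queue;  // drained by the caller at runtime
      input_queue.push(0x1000);

      // 1. Dequeue: take the next input mbuf (no address fixed at bind time).
      std::uintptr_t mbuf = input_queue.front();
      input_queue.pop();

      // 2. Zero copy: patch the task's arg slot so kernels use the mbuf directly.
      std::vector<std::uintptr_t> task_args(1, 0);
      task_args[0] = mbuf;

      // 3. Enqueue: publish the (here: pass-through) buffer on the output queue.
      output_queue.push(task_args[0]);
      std::printf("out mbuf: 0x%llx\n", static_cast<unsigned long long>(output_queue.front()));
      return 0;
    }
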
@@ -783,10 +766,8 @@ class DavinciModel { vector variable_op_list_; - vector output_size_list_; // Init by NetOutput Input Tensor - vector output_addr_list_; // Init by NetOutput Input Tensor - vector input_size_list_; // Init by Data Output Tensor - vector input_addr_list_; // Init by Data Output Tensor + std::map> input_data_info_; // Init by Data Output Tensor + std::map> output_data_info_; // Init by NetOutput Input Tensor // output op: save cce op actual needed memory size vector output_memory_size_list_; @@ -813,6 +794,7 @@ class DavinciModel { vector event_list_; vector label_list_; + set label_id_indication_; std::mutex outside_addrs_mutex_; std::map> input_outside_addrs_; @@ -830,6 +812,8 @@ class DavinciModel { bool is_inner_model_stream_; + bool is_async_mode_; // For NN execute, Async mode use rtMemcpyAsync on rt_model_stream_. + // ACL queue schedule, save queue ids for Init. std::vector cpu_task_list_; std::vector input_queue_ids_; // input queue ids created by caller. @@ -847,8 +831,6 @@ class DavinciModel { uint32_t device_id_; - bool is_train_mode_; - std::mutex flowctrl_op_index_internal_map_mutex_; std::map flowctrl_op_index_internal_map_; std::set active_stream_indication_; diff --git a/src/ge/graph/load/new_model_manager/model_manager.cc b/src/ge/graph/load/new_model_manager/model_manager.cc index a1fefff2..1b6b30c2 100644 --- a/src/ge/graph/load/new_model_manager/model_manager.cc +++ b/src/ge/graph/load/new_model_manager/model_manager.cc @@ -358,26 +358,17 @@ Status ModelManager::DataInputTensor(uint32_t model_id, const std::vectorGetDataList()) { - GE_CHECK_NOTNULL(op); - GE_CHECK_GE(inputs.size(), 1); - GE_CHECK_GE(inputs.size() - 1, index); - + for (size_t i = 0; i < inputs.size(); ++i) { DataBuffer data; - data.data = inputs[index].data.data; - data.length = inputs[index].data.length; + data.data = inputs[i].data.data; + data.length = inputs[i].data.length; input_data.blobs.push_back(data); - index++; } - CHECK_FALSE_EXEC(input_data.blobs.size() >= inputs.size(), - GELOGW("cur_inputs size = %zu, inputs size = %zu.", input_data.blobs.size(), inputs.size());); - OutputData output_data; output_data.model_id = model_id; output_data.index = 0; - for (size_t i = 0; i < outputs.size(); i++) { + for (size_t i = 0; i < outputs.size(); ++i) { DataBuffer data; data.data = outputs[i].data.data; data.length = outputs[i].data.length; @@ -675,6 +666,15 @@ Status ModelManager::LoadModelOffline(uint32_t &model_id, const ModelData &model break; } davinci_model->SetId(model_id); + + int32_t device_id = 0; + rtError_t rt_ret = rtGetDevice(&device_id); + if (rt_ret != RT_ERROR_NONE || device_id < 0) { + GELOGE(RT_FAILED, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id); + return FAILED; + } + davinci_model->SetDeviceId(device_id); + ret = davinci_model->Init(dev_ptr, mem_size, weight_ptr, weight_size); GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, break, "DavinciInit failed."); diff --git a/src/ge/graph/load/new_model_manager/model_utils.cc b/src/ge/graph/load/new_model_manager/model_utils.cc index 360a537f..dd2d20f6 100644 --- a/src/ge/graph/load/new_model_manager/model_utils.cc +++ b/src/ge/graph/load/new_model_manager/model_utils.cc @@ -53,27 +53,6 @@ bool ModelUtils::IsOutput(ConstOpDescPtr op_desc) { /// /// @ingroup domi_ome -/// @brief Check is the Input need trans code. 
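
The member hunk above folds four parallel size/address vectors into two maps keyed by the input or output index, so a lookup yields both values together and they can no longer drift apart. A minimal illustration of the new shape, assuming the {index -> {size, addr}} layout stated in the comments:

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <utility>

    // index -> {size, addr}, the layout of input_data_info_ / output_data_info_.
    using DataInfo = std::map<uint32_t, std::pair<int64_t, void *>>;

    int main() {
      static char buf[64];
      DataInfo input_data_info;
      input_data_info[0] = {static_cast<int64_t>(sizeof(buf)), buf};  // input 0

      auto it = input_data_info.find(0);
      if (it != input_data_info.end()) {
        std::printf("input[0]: size=%lld addr=%p\n",
                    static_cast<long long>(it->second.first), it->second.second);
      }
      return 0;
    }
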
-/// @return bool -/// -bool ModelUtils::IsInputTensorNeedTrans(ConstOpDescPtr op_desc, size_t tensor_index) { - GE_CHECK_NOTNULL_EXEC(op_desc, return false); - const auto &input_desc = op_desc->MutableInputDesc(static_cast(tensor_index)); - const auto &output_desc = op_desc->MutableOutputDesc(static_cast(tensor_index)); - GE_CHECK_NOTNULL_EXEC(input_desc, return false); - GE_CHECK_NOTNULL_EXEC(output_desc, return false); - - if ((output_desc->GetFormat() == FORMAT_NC1HWC0) && (output_desc->GetDataType() == DT_INT8)) { - // AIPP input, add attribute in data op to tag aipp - return false; - } - - return (input_desc->GetFormat() != output_desc->GetFormat()) || - (input_desc->GetDataType() != output_desc->GetDataType()); -} - -/// -/// @ingroup domi_ome /// @brief Get input size. /// @return vector /// @@ -398,6 +377,8 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co GE_CHK_STATUS(TensorUtils::GetDataOffset(tensor_desc, data_offset)); uint8_t *weight_addr = static_cast(weight_base + data_offset - logic_weight_base); v_input_data_addr.push_back(weight_addr); + GELOGI("[IMAS]GetInputDataAddrs graph_%u type[C] name[%s] input[%zu] memaddr[%p]", model_param.graph_id, + op_desc->GetName().c_str(), i, weight_addr); }); non_const_index++; continue; @@ -411,7 +392,10 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co non_const_index++; GE_IF_BOOL_EXEC(var_size != 0 && ge::VarManager::Instance(session_id)->IsVarAddr(input_offset), uint8_t *variable_addr = var_base + input_offset - logic_var_base; - v_input_data_addr.push_back(variable_addr); continue;); + v_input_data_addr.push_back(variable_addr); + GELOGI("[IMAS]GetInputDataAddrs graph_%u type[V] name[%s] input[%lu] memaddr[%p]", + model_param.graph_id, op_desc->GetName().c_str(), i, variable_addr); + continue;); bool input_tensor = false; GE_IF_BOOL_EXEC(TensorUtils::GetInputTensor(op_desc->GetOutputDesc(i), input_tensor) != GRAPH_SUCCESS, @@ -421,12 +405,14 @@ vector ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co uint8_t *mem_addr = nullptr; // l1 fusion if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { - mem_addr = reinterpret_cast(input_offset); + mem_addr = reinterpret_cast(reinterpret_cast(input_offset)); v_input_data_addr.push_back(mem_addr); } else { mem_addr = static_cast(mem_base + input_offset - logic_mem_base); v_input_data_addr.push_back(mem_addr); } + GELOGI("[IMAS]GetInputDataAddrs graph_%u type[F] name[%s] input[%zu] memaddr[%p]", model_param.graph_id, + op_desc->GetName().c_str(), i, mem_addr); } return v_input_data_addr; @@ -487,12 +473,14 @@ vector ModelUtils::GetOutputDataAddrs(const RuntimeParam &model_param, C uint8_t *mem_addr = nullptr; // l1 fusion if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { - mem_addr = reinterpret_cast(v_output_offset[i]); + mem_addr = reinterpret_cast(reinterpret_cast(v_output_offset[i])); v_output_data_addr.push_back(mem_addr); } else { mem_addr = static_cast(mem_base + v_output_offset[i] - logic_mem_base); v_output_data_addr.push_back(mem_addr); } + GELOGI("[IMAS]GetOutputDataAddrs graph_%u type[F] name[%s] output[%zu] memaddr[%p]", model_param.graph_id, + op_desc->GetName().c_str(), i, mem_addr); } return v_output_data_addr; } @@ -530,7 +518,7 @@ vector ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param if (has_mem_type_attr && v_memory_type[i] != RT_MEMORY_HBM) { v_workspace_data_addr.push_back(reinterpret_cast(v_workspace_offset[i])); GELOGI("L1Fusion: op: %s, GetWorkspaceDataAddrs 
mem_addr[workspace index %zu]:%p", op_desc->GetName().c_str(), i, - reinterpret_cast(v_workspace_offset[i])); + reinterpret_cast(reinterpret_cast(v_workspace_offset[i]))); } else { int64_t workspace_offset = v_workspace_offset[i]; int64_t workspace_bytes = v_workspace_bytes[i]; @@ -558,6 +546,7 @@ Status ModelUtils::ConvertVirtualAddressToPhysical(uint8_t *virtual_address, uin return RT_FAILED; } + GELOGD("virtual_address=%p, physical_address=%p", virtual_address, physical_address); return SUCCESS; } } // namespace ge diff --git a/src/ge/graph/load/new_model_manager/model_utils.h b/src/ge/graph/load/new_model_manager/model_utils.h index 1a15c930..479cc431 100644 --- a/src/ge/graph/load/new_model_manager/model_utils.h +++ b/src/ge/graph/load/new_model_manager/model_utils.h @@ -42,13 +42,6 @@ class ModelUtils { /// /// @ingroup domi_ome - /// @brief Check is the Input need trans code. - /// @return bool - /// - static bool IsInputTensorNeedTrans(ConstOpDescPtr op_desc, size_t tensor_index); - - /// - /// @ingroup domi_ome /// @brief Get input size. /// @return vector /// diff --git a/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc index cb30092c..75acf548 100644 --- a/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/end_graph_task_info.cc @@ -38,6 +38,7 @@ Status EndGraphTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin } model_ = davinci_model->GetRtModelHandle(); + GELOGI("InitEndGraphTaskInfo Init Success, model:%p, stream:%p", model_, stream_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc index 52511f03..3fa5eee2 100644 --- a/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc @@ -125,6 +125,7 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m return RT_FAILED; } + GELOGD("hccl_stream addr is=%p", stream); hccl_stream_list_.push_back(stream); davinci_model->PushHcclStream(stream); } @@ -245,6 +246,8 @@ void HcclTaskInfo::GetPrivateDefByTaskDef(const domi::TaskDef &task) { GELOGE(RT_FAILED, "Call rtMemcpy Fail, ret = 0x%X.", ret); return; } + + GELOGI("The first address of the custom info, privateDef=%p.", private_def_); } } } diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc index 88e8a1bb..faaa3f82 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc @@ -41,6 +41,7 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin } auto kernel_ex_def = task_def.kernel_ex(); + const RuntimeParam &rts_param = davinci_model->GetRuntimeParam(); // 1. 
Copy context from kernelExDef.private to workspace uint32_t op_index = kernel_ex_def.op_index(); @@ -50,12 +51,12 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin return INTERNAL_ERROR; } - if (CopyTaskInfo(kernel_ex_def, davinci_model->GetRuntimeParam(), op_desc) != SUCCESS) { + if (CopyTaskInfo(kernel_ex_def, rts_param, op_desc) != SUCCESS) { GELOGE(FAILED, "copy task info to workspace failed."); return FAILED; } - vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(davinci_model->GetRuntimeParam(), op_desc); + const vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc); if (workspace_data_addrs.empty()) { GELOGE(FAILED, "workspace_data_addrs is empty."); return FAILED; @@ -79,16 +80,16 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin uint64_t step_id_addr = 0; OpDescPtr step_id_node = davinci_model->GetVariableOp(NODE_NAME_GLOBAL_STEP); if (step_id_node != nullptr) { - vector v_step_id_addr = ModelUtils::GetOutputDataAddrs(davinci_model->GetRuntimeParam(), step_id_node); + vector v_step_id_addr = ModelUtils::GetOutputDataAddrs(rts_param, step_id_node); if (!v_step_id_addr.empty()) { step_id_addr = static_cast(reinterpret_cast(v_step_id_addr[0])); } } // 3. Set workspaceaddr, inputOutputDataAddr - uint64_t workspace_base_addr = reinterpret_cast(workspace_data_addrs[0]); - vector input_addrs = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); - vector output_addrs = ModelUtils::GetOutputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); + uint64_t workspace_base_addr = reinterpret_cast(reinterpret_cast(workspace_data_addrs[0])); + const vector input_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); + const vector output_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); vector io_addrs; io_addrs.insert(io_addrs.end(), input_addrs.begin(), input_addrs.end()); io_addrs.insert(io_addrs.end(), output_addrs.begin(), output_addrs.end()); @@ -132,7 +133,13 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin rt_ret = rtMemcpy(kernel_buf_, sizeof(STR_FWK_OP_KERNEL), static_cast(&fwk_op_kernel), sizeof(STR_FWK_OP_KERNEL), RT_MEMCPY_HOST_TO_DEVICE); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy error, ret: Ox%X", rt_ret); return FAILED;) - davinci_model->SetZeroCopyAddr(op_desc, io_addrs, input_output_addr_); + + vector virtual_io_addrs; // use virtual address for zero copy key. 
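
Pausing here on the recurring zero-copy change: the launch args may now hold physical addresses, so the zero-copy table is keyed by the virtual input/output addresses instead, apparently fetched through an extra boolean on GetInputDataAddrs/GetOutputDataAddrs (that signature is an assumption read off these call sites). The key-building step in isolation:

    #include <cstdio>
    #include <vector>

    // Concatenate virtual input and output addresses into one key list, the
    // pattern repeated before each SetZeroCopyAddr call in the hunks here.
    std::vector<void *> BuildZeroCopyKeys(const std::vector<void *> &virtual_in,
                                          const std::vector<void *> &virtual_out) {
      std::vector<void *> keys;
      keys.reserve(virtual_in.size() + virtual_out.size());
      keys.insert(keys.end(), virtual_in.begin(), virtual_in.end());
      keys.insert(keys.end(), virtual_out.begin(), virtual_out.end());
      return keys;
    }

    int main() {
      int a = 0, b = 0;
      auto keys = BuildZeroCopyKeys({&a}, {&b});
      std::printf("keys: %zu\n", keys.size());  // 2
      return 0;
    }
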
+ const vector virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); + const vector virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_in_addrs.begin(), virtual_in_addrs.end()); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end()); + davinci_model->SetZeroCopyAddr(op_desc, virtual_io_addrs, input_output_addr_); kernel_buf_size_ = sizeof(STR_FWK_OP_KERNEL); davinci_model_ = davinci_model; diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h index 9aab55e7..a6419f9f 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h @@ -25,6 +25,7 @@ class KernelExTaskInfo : public TaskInfo { public: KernelExTaskInfo() : task_id_(0), + stream_id_(0), dump_flag_(RT_KERNEL_DEFAULT), kernel_buf_size_(0), davinci_model_(nullptr), diff --git a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc index 407efd63..47956cf2 100644 --- a/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc @@ -221,13 +221,13 @@ Status KernelTaskInfo::SuperKernelLaunch() { return RT_FAILED; } // Call the fuse API - skt::SuperKernel *superKernel; + skt::SuperKernel *superKernel = nullptr; if (factory->FuseKernels(skt_kernel_list, skt_arg_list, skt_info_.last_block_dim, superKernel) != SUCCESS) { GELOGE(RT_FAILED, "SuperKernelLaunch: fuse call failed"); return RT_FAILED; } // Launch a super kernel - if (superKernel->Launch(skt_info_.last_stream, true) != SUCCESS) { + if (superKernel->Launch(skt_info_.last_stream, RT_KERNEL_DUMPFLAG) != SUCCESS) { GELOGE(RT_FAILED, "SuperKernelLaunch: launch failed"); return RT_FAILED; } @@ -341,6 +341,7 @@ Status KernelTaskInfo::Distribute() { rtError_t rt_ret = RT_ERROR_NONE; char *skt_enable_env = getenv("SKT_ENABLE"); int64_t env_flag = (skt_enable_env != nullptr) ? strtol(skt_enable_env, nullptr, 10) : 0; + bool call_skt = ((env_flag != 0) || is_l1_fusion_enable_); if (kernel_type_ == cce::ccKernelType::AI_CPU) { // blockDim is reserved parameter, set to 1 rt_ret = rtCpuKernelLaunchWithFlag(reinterpret_cast(so_name_.c_str()), @@ -348,11 +349,10 @@ Status KernelTaskInfo::Distribute() { nullptr, stream_, dump_flag_); } else { /* default: not skt launch */ - bool call_skt = ((env_flag != 0) || is_l1_fusion_enable_); GELOGI( - "KernelTaskInfo Distribute Start, sktenable:%ld taskid:%u sktid:%u last_sktid:%u stubfunc_name:%s " + "KernelTaskInfo Distribute Start, sktenable:%d taskid:%u sktid:%u last_sktid:%u stubfunc_name:%s " "stubfunc:%p blockdim:%u stream:%p", - env_flag, task_id_, skt_id_, skt_info_.last_task_id, stub_func_name_.c_str(), stub_func_, block_dim_, stream_); + call_skt, task_id_, skt_id_, skt_info_.last_task_id, stub_func_name_.c_str(), stub_func_, block_dim_, stream_); // l1 fusion enable and env flag open (kCloseSkt for skt debug) if (call_skt && (env_flag != kCloseSkt)) { GE_RETURN_IF_ERROR(SuperKernelDistribute()); @@ -371,7 +371,7 @@ Status KernelTaskInfo::Distribute() { GELOGI( "KernelTaskInfo Distribute Success. 
sktenable:%d taskid:%d sktid:%d stubfunc_name:%s stubfunc:%p " "blockdim:%d stream:%p", - env_flag, task_id_, skt_id_, stub_func_name_.c_str(), stub_func_, block_dim_, stream_); + call_skt, task_id_, skt_id_, stub_func_name_.c_str(), stub_func_, block_dim_, stream_); return SUCCESS; } @@ -423,12 +423,12 @@ Status KernelTaskInfo::InitTVMTask(DavinciModel *davinci_model, uint16_t offset, stub_func_ = const_cast(bin_file_key); } - const vector input_data_addrs = ModelUtils::GetInputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); - const vector output_data_addrs = ModelUtils::GetOutputDataAddrs(davinci_model->GetRuntimeParam(), op_desc); - const vector workspace_data_addrs = - ModelUtils::GetWorkspaceDataAddrs(davinci_model->GetRuntimeParam(), op_desc); - vector tensor_device_addrs; + const RuntimeParam &rts_param = davinci_model->GetRuntimeParam(); + const vector input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); + const vector output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); + const vector workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc); + vector tensor_device_addrs; tensor_device_addrs.insert(tensor_device_addrs.end(), input_data_addrs.begin(), input_data_addrs.end()); tensor_device_addrs.insert(tensor_device_addrs.end(), output_data_addrs.begin(), output_data_addrs.end()); tensor_device_addrs.insert(tensor_device_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end()); @@ -468,7 +468,13 @@ Status KernelTaskInfo::InitTVMTask(DavinciModel *davinci_model, uint16_t offset, reinterpret_cast(reinterpret_cast(args_) + offset + sizeof(void *) * input_data_addrs.size()); } - davinci_model_->SetZeroCopyAddr(op_desc, tensor_device_addrs, static_cast(args_) + offset); + vector virtual_io_addrs; // use virtual address for zero copy key. 
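
The Distribute() hunk hoists the super-kernel decision above the branch so both log lines can report it: the super-kernel path is taken when the SKT_ENABLE environment variable parses non-zero or L1 fusion is enabled. Extracted as a standalone predicate (a sketch, not the GE function):

    #include <cstdio>
    #include <cstdlib>

    // call_skt as computed in KernelTaskInfo::Distribute above: env flag OR'd
    // with the per-model L1 fusion switch.
    bool CallSuperKernel(bool is_l1_fusion_enable) {
      const char *skt_enable_env = std::getenv("SKT_ENABLE");
      long env_flag = (skt_enable_env != nullptr) ? std::strtol(skt_enable_env, nullptr, 10) : 0;
      return (env_flag != 0) || is_l1_fusion_enable;
    }

    int main() {
      std::printf("call_skt=%d\n", CallSuperKernel(false));  // depends on SKT_ENABLE
      return 0;
    }
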
+ const vector virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); + const vector virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_in_addrs.begin(), virtual_in_addrs.end()); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end()); + davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, static_cast(args_) + offset); + // update origin l2 data string sm_desc = kernel_def.sm_desc(); char *sm_contrl = nullptr; @@ -516,6 +522,7 @@ Status KernelTaskInfo::InitAICPUCustomTask(const std::mapsecond; + const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam(); const domi::KernelContext &context = kernel_def.context(); const uint32_t kCustomAicpuArgsLen = 5; @@ -534,11 +541,8 @@ Status KernelTaskInfo::InitAICPUCustomTask(const std::map(const_cast(context.args_offset().data())))[i]; } - const std::vector input_data_addrs = - ModelUtils::GetInputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); - const std::vector output_data_addrs = - ModelUtils::GetOutputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); - + const std::vector input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); + const std::vector output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); Status ret = StoreInputOutputTensor(input_data_addrs, output_data_addrs, ModelUtils::GetInputDescs(op_desc), ModelUtils::GetOutputDescs(op_desc)); @@ -583,15 +587,15 @@ Status KernelTaskInfo::InitAICPUCustomTask(const std::map(args + ctx_.argsOffset[0])) = - reinterpret_cast(custom_info_.input_descs); // arg 0 + reinterpret_cast(reinterpret_cast(custom_info_.input_descs)); // arg 0 *(reinterpret_cast(args + ctx_.argsOffset[1])) = - reinterpret_cast(custom_info_.input_addrs); // arg 1 + reinterpret_cast(reinterpret_cast(custom_info_.input_addrs)); // arg 1 *(reinterpret_cast(args + ctx_.argsOffset[2])) = - reinterpret_cast(custom_info_.output_descs); // arg 2 + reinterpret_cast(reinterpret_cast(custom_info_.output_descs)); // arg 2 *(reinterpret_cast(args + ctx_.argsOffset[3])) = - reinterpret_cast(custom_info_.output_addrs); // arg 3 + reinterpret_cast(reinterpret_cast(custom_info_.output_addrs)); // arg 3 *(reinterpret_cast(args + ctx_.argsOffset[4])) = - reinterpret_cast(custom_info_.attr_handle); // arg 4 + reinterpret_cast(reinterpret_cast(custom_info_.attr_handle)); // arg 4 rt_ret = rtMalloc(&args_, args_size_, RT_MEMORY_HBM); if (rt_ret != RT_ERROR_NONE) { @@ -606,8 +610,10 @@ Status KernelTaskInfo::InitAICPUCustomTask(const std::mapSetZeroCopyAddr(op_desc, input_data_addrs, custom_info_.input_addrs); - davinci_model_->SetZeroCopyAddr(op_desc, output_data_addrs, custom_info_.output_addrs); + const vector virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); + const vector virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); + davinci_model_->SetZeroCopyAddr(op_desc, virtual_in_addrs, custom_info_.input_addrs); + davinci_model_->SetZeroCopyAddr(op_desc, virtual_out_addrs, custom_info_.output_addrs); return SUCCESS; } @@ -714,8 +720,10 @@ Status KernelTaskInfo::InitAicpuTask(const std::map &op_lis } OpDescPtr op_desc = iter->second; - vector input_addrs = ModelUtils::GetInputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); - vector output_addrs = ModelUtils::GetOutputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc); + const RuntimeParam &rts_param = 
davinci_model_->GetRuntimeParam(); + + vector input_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc); + vector output_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc); vector io_addrs; io_addrs.insert(io_addrs.end(), input_addrs.begin(), input_addrs.end()); io_addrs.insert(io_addrs.end(), output_addrs.begin(), output_addrs.end()); @@ -752,7 +760,13 @@ Status KernelTaskInfo::InitAicpuTask(const std::map &op_lis sizeof(void *) * input_addrs.size()); } - davinci_model_->SetZeroCopyAddr(op_desc, io_addrs, static_cast(args_) + sizeof(aicpu::AicpuParamHead)); + vector virtual_io_addrs; // use virtual address for zero copy key. + const vector virtual_in_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc, false); + const vector virtual_out_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc, false); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_in_addrs.begin(), virtual_in_addrs.end()); + virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end()); + davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, + static_cast(args_) + sizeof(aicpu::AicpuParamHead)); return SUCCESS; } @@ -977,7 +991,7 @@ Status KernelTaskInfo::SetFlowtable(std::string &flowtable, const domi::KernelDe *(reinterpret_cast( args + (reinterpret_cast(const_cast(context.args_offset().data())))[0])) = - reinterpret_cast(flowtable_); + reinterpret_cast(reinterpret_cast(flowtable_)); } return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc new file mode 100644 index 00000000..9c5e4c29 --- /dev/null +++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.cc @@ -0,0 +1,149 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.h" + +#include "framework/common/debug/ge_log.h" +#include "graph/load/new_model_manager/davinci_model.h" + +namespace ge { +Status MemcpyAddrAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) { + GELOGI("MemcpyAddrAsyncTaskInfo Init Start."); + if (davinci_model == nullptr) { + GELOGE(PARAM_INVALID, "davinci_model is null!"); + return PARAM_INVALID; + } + + Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList()); + if (ret != SUCCESS) { + return ret; + } + + auto memcpy_async_def = task_def.memcpy_async(); + + uint64_t logic_dst = memcpy_async_def.dst(); + uint64_t logic_src = memcpy_async_def.src(); + + dst_max_ = memcpy_async_def.dst_max(); + + uint64_t update_base_addr = 0; + ret = GetUpdateBaseAddr(davinci_model, logic_src, update_base_addr); + if (ret != SUCCESS) { + return ret; + } + src_ = reinterpret_cast<void *>(update_base_addr + logic_src); + if (src_ == nullptr) { + GELOGE(PARAM_INVALID, "src_ is null!"); + return PARAM_INVALID; + } + + uint64_t mem_base = reinterpret_cast<uintptr_t>(davinci_model->MemBase()); + uint64_t logic_mem_base = davinci_model->GetRtBaseAddr(); + dst_ = reinterpret_cast<void *>(mem_base + (logic_dst - logic_mem_base)); + if (dst_ == nullptr) { + GELOGE(PARAM_INVALID, "dst_ is null!"); + return PARAM_INVALID; + } + + count_ = memcpy_async_def.count(); + kind_ = memcpy_async_def.kind(); + + // malloc args memory: two pointer slots, the staged src then the staged dst + size_t args_size = sizeof(void *); + rtError_t rt_ret = rtMalloc(&args_, args_size * 2, RT_MEMORY_HBM); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + + // copy origin src + GELOGI("src_args:%p, destMax:%zu, src_:%p, count=%zu, kind=%u", args_, args_size, src_, args_size, + RT_MEMCPY_HOST_TO_DEVICE); + rt_ret = rtMemcpy(args_, args_size, &src_, args_size, RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api for src failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + + // copy origin dst + GELOGI("dst_args:%p, destMax:%zu, dst_:%p, count=%zu, kind=%u", + reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + args_size), args_size, dst_, args_size, + RT_MEMCPY_HOST_TO_DEVICE); + rt_ret = rtMemcpy(reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + args_size), args_size, &dst_, + args_size, RT_MEMCPY_HOST_TO_DEVICE); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api for dst failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + + GELOGI("InitMemcpyAddrAsyncTaskInfo, logic_src:%p, logic_dst:%p, src:%p, dst:%p, src_args:%p, dst_args:%p", + reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(logic_src)), + reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(logic_dst)), src_, dst_, args_, + reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + args_size)); + + return SUCCESS; +} + +Status MemcpyAddrAsyncTaskInfo::Distribute() { + GELOGI("MemcpyAddrAsyncTaskInfo Distribute Start."); + GELOGI("Distribute MemcpyAddrAsync, dst_max:%lu, count:%lu, kind:%u.", dst_max_, count_, kind_); + + rtError_t rt_ret = rtMemcpyAsync(reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(args_) + sizeof(void *)), + dst_max_, args_, count_, static_cast<rtMemcpyKind_t>(kind_), stream_); + if (rt_ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); + return RT_FAILED; + } + + return SUCCESS; +} + +Status MemcpyAddrAsyncTaskInfo::GetUpdateBaseAddr(DavinciModel *davinci_model, uint64_t update_addr, + uint64_t &base_addr) { + GE_CHECK_NOTNULL(davinci_model); + uint64_t data_base_addr =
reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->MemBase())) - davinci_model->GetRtBaseAddr(); + uint64_t weight_base_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->WeightsMemBase())) - + davinci_model->GetRtWeightAddr(); + uint64_t var_base_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(davinci_model->VarMemBase())) - + davinci_model->GetRtVarAddr(); + + uint64_t data_base_addr_start = davinci_model->GetRtBaseAddr(); + uint64_t data_base_addr_end = davinci_model->GetRtBaseAddr() + davinci_model->TotalMemSize(); + uint64_t weight_base_addr_start = davinci_model->GetRtWeightAddr(); + uint64_t weight_base_addr_end = davinci_model->GetRtWeightAddr() + davinci_model->TotalWeightsMemSize(); + uint64_t variable_base_addr_start = davinci_model->GetRtVarAddr(); + uint64_t variable_base_addr_end = davinci_model->GetRtVarAddr() + davinci_model->TotalVarMemSize(); + + if ((data_base_addr_start <= update_addr) && (update_addr <= data_base_addr_end)) { + base_addr = data_base_addr; + GELOGI("The update_addr is data address."); + } else if ((weight_base_addr_start <= update_addr) && (update_addr <= weight_base_addr_end)) { + base_addr = weight_base_addr; + GELOGI("The update_addr is weight address."); + } else if ((variable_base_addr_start <= update_addr) && (update_addr <= variable_base_addr_end)) { + base_addr = var_base_addr; + GELOGI("The update_addr is variable address."); + } else if (update_addr != 0) { + base_addr = 0; + GELOGE(PARAM_INVALID, "The update_addr is abnormal."); + return PARAM_INVALID; + } + return SUCCESS; +} + +REGISTER_TASK_INFO(RT_MODEL_TASK_MEMCPY_ADDR_ASYNC, MemcpyAddrAsyncTaskInfo); +} // namespace ge diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.h b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.h new file mode 100644 index 00000000..9252e43a --- /dev/null +++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_addr_async_task_info.h @@ -0,0 +1,55 @@ +/** + * Copyright 2019-2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ADDR_ASYNC_TASK_INFO_H_ +#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ADDR_ASYNC_TASK_INFO_H_ +#include "graph/load/new_model_manager/task_info/task_info.h" + +namespace ge { +class MemcpyAddrAsyncTaskInfo : public TaskInfo { + public: + MemcpyAddrAsyncTaskInfo() : dst_(nullptr), dst_max_(0), src_(nullptr), args_(nullptr), count_(0), kind_(0) {} + + ~MemcpyAddrAsyncTaskInfo() override { + src_ = nullptr; + dst_ = nullptr; + + if (args_ != nullptr) { + rtError_t ret = rtFree(args_); + if (ret != RT_ERROR_NONE) { + GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", ret); + } + } + + args_ = nullptr; + } + + Status Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) override; + + Status Distribute() override; + + private: + Status GetUpdateBaseAddr(DavinciModel *davinci_model, uint64_t update_addr, uint64_t &base_addr); + + void *dst_; + uint64_t dst_max_; + void *src_; + void *args_; + uint64_t count_; + uint32_t kind_; +}; +} // namespace ge +#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_MEMCPY_ADDR_ASYNC_TASK_INFO_H_ diff --git a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc index cdd9eb37..c783c718 100644 --- a/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/memcpy_async_task_info.cc @@ -51,6 +51,9 @@ Status MemcpyAsyncTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *da count_ = memcpy_async_def.count(); kind_ = memcpy_async_def.kind(); + GELOGI("MemcpyAsyncTaskInfo Init Success, logic_src:%p, logic_dst:%p, src:%p, dst:%p", + reinterpret_cast(reinterpret_cast(logic_src)), + reinterpret_cast(reinterpret_cast(logic_dst)), src_, dst_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc index 3d73b9cb..21c80c83 100644 --- a/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/stream_active_task_info.cc @@ -63,6 +63,8 @@ Status StreamActiveTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d active_stream_ = davinci_model->GetStreamList()[active_stream_index_list[internal_index]]; active_stream_id_ = stream_active_def.active_stream_id(); + GELOGI("InitStreamActiveTaskInfo Init Success, index:%u, activeStream:%p, activeStreamID:%u.", internal_index, + active_stream_, active_stream_id_); return SUCCESS; } @@ -74,6 +76,8 @@ Status StreamActiveTaskInfo::Distribute() { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } + + GELOGI("StreamActiveTaskInfo Distribute Success. 
activeStreamID:%p.", active_stream_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc b/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc index c14a0e1f..a54bf012 100644 --- a/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc +++ b/src/ge/graph/load/new_model_manager/task_info/stream_switch_task_info.cc @@ -95,6 +95,10 @@ Status StreamSwitchTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *d } data_type_ = static_cast(data_type); } + + GELOGI("InitStreamSwitchTaskInfo Init Success, cond:%d, trueStream:%p, trueStreamID:%u, datatype:%d.", cond_, + true_stream_, true_stream_id_, data_type_); + return SUCCESS; } @@ -105,6 +109,8 @@ Status StreamSwitchTaskInfo::Distribute() { GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret); return RT_FAILED; } + + GELOGI("StreamSwitchTaskInfo Distribute Success. cond:%d, stream:%p, datatype:%d.", cond_, true_stream_, data_type_); return SUCCESS; } diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc index 38dbd8b3..b8fc77ac 100644 --- a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc +++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.cc @@ -19,17 +19,17 @@ namespace ge { namespace skt { -Status SuperKernel::Launch(rtStream_t stream, bool dump_flag) { +Status SuperKernel::Launch(rtStream_t stream, uint32_t dump_flag) { const void *func_stub_ = this->GetFuncStub(); - const void *args[] = {this->GetNavTablePtr(), (const void *)this->GetNavTableSize()}; + const void *args[] = {this->GetNavTablePtr(), + reinterpret_cast(reinterpret_cast(this->GetNavTableSize()))}; - void *device_args_addr = nullptr; - rtError_t rt_ret = rtMalloc((void **)&(device_args_addr), sizeof(args), RT_MEMORY_HBM); + rtError_t rt_ret = rtMalloc((void **)&(device_args_addr_), sizeof(args), RT_MEMORY_HBM); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failied. error: 0x%X", rt_ret); return FAILED;) - rt_ret = rtMemcpy((void *)device_args_addr, sizeof(args), (void *)args, sizeof(args), RT_MEMCPY_HOST_TO_DEVICE); + rt_ret = rtMemcpy((void *)device_args_addr_, sizeof(args), (void *)args, sizeof(args), RT_MEMCPY_HOST_TO_DEVICE); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failied. error: 0x%X", rt_ret); return FAILED;) - rt_ret = rtKernelLaunchWithFlag((void *const)func_stub_, block_dim_, device_args_addr, sizeof(args), NULL, stream, + rt_ret = rtKernelLaunchWithFlag((void *const)func_stub_, block_dim_, device_args_addr_, sizeof(args), NULL, stream, dump_flag); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelLaunchWithFlag failied. 
error: 0x%X", rt_ret); return FAILED;) diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h index b662d97b..1c31acd1 100644 --- a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h +++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel.h @@ -25,6 +25,7 @@ namespace ge { namespace skt { class SuperKernel { private: + void *device_args_addr_ = nullptr; const void *func_stub_; void *dev_nav_table_; uint64_t nav_table_size_; @@ -33,8 +34,18 @@ class SuperKernel { public: SuperKernel(const void *stub, void *ptr, uint64_t sz, uint32_t dim) : func_stub_(stub), dev_nav_table_(ptr), nav_table_size_(sz), block_dim_(dim) {} - ~SuperKernel() {} - Status Launch(rtStream_t stream, bool dump_flag); + ~SuperKernel() { + // free memory when all releasing + if (device_args_addr_ != nullptr) { + GE_CHK_RT(rtFree(device_args_addr_)); + GELOGI("SKT: super_kernel args addr free."); + } + if (dev_nav_table_ != nullptr) { + GE_CHK_RT(rtFree(dev_nav_table_)); + GELOGI("SKT: super_kernel args addr free."); + } + } + Status Launch(rtStream_t stream, uint32_t dump_flag); const void *GetFuncStub() const { return func_stub_; } const void *GetNavTablePtr() const { return dev_nav_table_; } uint64_t GetNavTableSize() const { return nav_table_size_; } diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc index ab3f68f1..63107f5e 100644 --- a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc +++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.cc @@ -30,26 +30,26 @@ Status SuperKernelFactory::Init() { rt_ret = rtGetFunctionByName(this->sk_stub_name_.c_str(), &this->func_stub_); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetFunctionByName " - "failied. stub_func: %s", + "failed. stub_func: %s", this->sk_stub_name_.c_str()); return FAILED;) rt_ret = rtGetAddrByFun(this->func_stub_, &this->func_ptr_); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); return FAILED;) if (this->use_physical_address_ != nullptr) { void *skt_func = nullptr; rt_ret = rtKernelConfigTransArg(this->func_ptr_, sizeof(uint64_t), 0, &skt_func); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. 
error: 0x%X", rt_ret); return FAILED;) GELOGD( "SKT: fuseKernels super_kernel_template subFunc %p, device func " "address %p, device physic PC %p", - (uint64_t)this->func_stub_, (uint64_t)this->func_ptr_, (uint64_t)skt_func); + this->func_stub_, this->func_ptr_, skt_func); } else { GELOGD( "SKT: fuseKernels super_kernel_template subFunc %p, device func " "address %p", - (uint64_t)this->func_stub_, (uint64_t)this->func_ptr_); + this->func_stub_, this->func_ptr_); } } is_init_ = true; @@ -94,63 +94,66 @@ Status SuperKernelFactory::FuseKernels(const std::vector &stub_func_list uint64_t nav_table_size = 2 * stub_func_list.size() * sizeof(int64_t); rtError_t rt_ret; + void *hbm_nav_table_addr = nullptr; if (this->use_physical_address_ != nullptr) { for (unsigned i = 0; i < stub_func_list.size(); i++) { void *sub_device_func = nullptr; rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); return FAILED;) void *sub_device_func_pys = nullptr; void *args_addr_pys = nullptr; rt_ret = rtKernelConfigTransArg(sub_device_func, sizeof(uint64_t), 0, &sub_device_func_pys); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); return FAILED;) rt_ret = rtKernelConfigTransArg(args_addr_list[i], sizeof(uint64_t), 0, &args_addr_pys); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); return FAILED;) GELOGD( "SKT: fuseKernels subFunc %p, device func address %p, device " "physic func address %p", - stub_func_list[i], (uint64_t)sub_device_func, (uint64_t)sub_device_func_pys); - nav_table[i * 2] = (uint64_t)sub_device_func_pys / 4; - GELOGD("SKT: CALL offet %p", nav_table[i * 2]); - nav_table[i * 2 + 1] = (uint64_t)args_addr_pys; + stub_func_list[i], sub_device_func, sub_device_func_pys); + // store two uint64_t address + // address divided by 4 because of 32bits encoding, call offset will *4 when calculating + nav_table[i * 2] = reinterpret_cast(reinterpret_cast(sub_device_func_pys)) / 4; + GELOGD("SKT: CALL offset %p", nav_table[i * 2]); + nav_table[i * 2 + 1] = reinterpret_cast(reinterpret_cast(args_addr_pys)); + GELOGD("SKT: fuseKernels args base address %p", nav_table[i * 2 + 1]); } - void *hbm_nav_table_addr = nullptr; void *hbm_nav_table_addr_pys = nullptr; rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failied. error: 0x%X", rt_ret); return FAILED;) + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;) rt_ret = rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table, nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failied. error: 0x%X", rt_ret); return FAILED;) + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. 
error: 0x%X", rt_ret); return FAILED;) rt_ret = rtKernelConfigTransArg(hbm_nav_table_addr, sizeof(uint64_t), 0, &hbm_nav_table_addr_pys); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtKernelConfigTransArg failed. error: 0x%X", rt_ret); return FAILED;) - GELOGD("SKT: hbm_nav_table_addr %p, hbm_nav_table_addr_pys %p", (uint64_t)hbm_nav_table_addr, - (uint64_t)hbm_nav_table_addr_pys); + GELOGD("SKT: hbm_nav_table_addr %p, hbm_nav_table_addr_pys %p", hbm_nav_table_addr, hbm_nav_table_addr_pys); // Create the necessary metadata for the super kernel h = new SuperKernel(this->func_stub_, hbm_nav_table_addr_pys, nav_table_size, block_dim); } else { for (unsigned i = 0; i < stub_func_list.size(); i++) { void *sub_device_func = nullptr; rt_ret = rtGetAddrByFun(stub_func_list[i], &sub_device_func); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failied. error: 0x%X", rt_ret); + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtGetAddrByFun failed. error: 0x%X", rt_ret); return FAILED;) - GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], (uint64_t)sub_device_func); - nav_table[i * 2] = (uint64_t)sub_device_func / 4; + GELOGD("SKT: fuseKernels subFunc %p, device func address %p", stub_func_list[i], sub_device_func); + // store two uint64_t address + // address divided by 4 because of 32bits encoding, call offset will *4 when calculating + nav_table[i * 2] = reinterpret_cast(reinterpret_cast(sub_device_func)) / 4; GELOGD("SKT: CALL offet %p", nav_table[i * 2]); - nav_table[i * 2 + 1] = (uint64_t)args_addr_list[i]; + nav_table[i * 2 + 1] = reinterpret_cast(reinterpret_cast(args_addr_list[i])); GELOGD("SKT: fuseKernels args base address %p", nav_table[i * 2 + 1]); } - void *hbm_nav_table_addr = nullptr; rt_ret = rtMalloc((void **)&hbm_nav_table_addr, nav_table_size, RT_MEMORY_HBM); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failied. error: 0x%X", rt_ret); return FAILED;) + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc failed. error: 0x%X", rt_ret); return FAILED;) rt_ret = rtMemcpy((void *)hbm_nav_table_addr, nav_table_size, (void *)nav_table, nav_table_size, RT_MEMCPY_HOST_TO_DEVICE); - GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failied. error: 0x%X", rt_ret); return FAILED;) + GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy failed. 
error: 0x%X", rt_ret); return FAILED;) // Create the necessary metadata for the super kernel h = new SuperKernel(this->func_stub_, hbm_nav_table_addr, nav_table_size, block_dim); } diff --git a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h index 7b59d4bf..7ceb5cfa 100644 --- a/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h +++ b/src/ge/graph/load/new_model_manager/task_info/super_kernel/super_kernel_factory.h @@ -31,12 +31,12 @@ class SuperKernelFactory { const char *use_physical_address_ = getenv("GE_USE_PHYSICAL_ADDRESS"); bool is_init_ = false; SuperKernelFactory(){}; + ~SuperKernelFactory(){}; public: SuperKernelFactory(SuperKernelFactory const &) = delete; void operator=(SuperKernelFactory const &) = delete; static SuperKernelFactory &GetInstance(); - SuperKernelFactory(const std::string &sk_stub_name_, const std::string &bin_file); Status Init(); Status Uninitialize(); Status FuseKernels(const std::vector &stub_func_list, const std::vector &args_addr_list, diff --git a/src/ge/graph/manager/graph_manager.cc b/src/ge/graph/manager/graph_manager.cc index f6fc8389..d4680d94 100644 --- a/src/ge/graph/manager/graph_manager.cc +++ b/src/ge/graph/manager/graph_manager.cc @@ -33,6 +33,7 @@ #include "framework/common/debug/ge_log.h" #include "framework/common/ge_inner_error_codes.h" #include "framework/common/ge_types.h" +#include "graph/manager/util/rt_context_util.h" #include "graph/common/transop_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/ge_context.h" @@ -117,6 +118,7 @@ Status GraphManager::Initialize(const std::map &options) { } graph_map_.clear(); + cache_helper_map_.clear(); init_flag_ = true; thread_run_flag_ = true; @@ -180,6 +182,7 @@ Status GraphManager::Finalize() { } } graph_map_.clear(); + cache_helper_map_.clear(); // graph context if (graph_context_ != nullptr) { @@ -426,6 +429,13 @@ Status GraphManager::PreRun(const GraphNodePtr &graph_node, const std::vectorSetSubGraph(merged_compute_graph); // set subgraphlist to graphnode graph_node->SetSubGraph(sub_graph_list); + // when set incre build, save om model and var manager + auto save_ret = SaveCacheAfterBuild(graph_node->GetGraphId(), merged_compute_graph, ge_model); + if (save_ret != SUCCESS) { + GELOGW("Fail to save cache."); + } + // release rts generate context + RtContextUtil::GetInstance().DestroyrtContexts(); GE_TIMESTAMP_END(PreRun, "GraphManager::PreRun"); GEEVENT("[GEPERFTRACE] GE PreRun End"); return ret; @@ -444,10 +454,14 @@ Status GraphManager::StartForRunGraph(const GraphNodePtr &graph_node, const std: return PARAM_INVALID; } GeModelPtr ge_model = nullptr; - ret = PreRun(graph_node, inputs, ge_models, ge_model, session_id); + // check need incre build. 
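
The incremental-build wiring above is worth restating as a contract: StartForRunGraph now consults IncreBuild first and only falls back to PreRun when it fails, while IncreBuild itself loads the model from the om cache on a hit and saves cache bookkeeping on a miss. A condensed sketch of that control flow, with simplified stand-in signatures (the real functions take graph nodes and cache helpers):

    #include <cstdio>

    enum Status { SUCCESS = 0, FAILED = 1 };
    using Step = Status (*)();

    // Cache-first build: serve from cache on a clean hit, otherwise record
    // cache info and report FAILED so the caller runs the full build.
    Status TryIncreBuild(bool cache_hit, Step load_from_cache, Step save_before_build) {
      if (cache_hit) {
        if (load_from_cache() == SUCCESS) {
          return SUCCESS;  // model and var manager recovered from the om cache
        }
        std::fprintf(stderr, "load from cache failed, abandon\n");
      }
      if (save_before_build() != SUCCESS) {  // best effort: failure only warns
        std::fprintf(stderr, "save cache before build failed\n");
      }
      return FAILED;  // caller falls back to the full PreRun path
    }

    static Status AlwaysOk() { return SUCCESS; }

    int main() {
      std::printf("hit: %d, miss: %d\n", TryIncreBuild(true, AlwaysOk, AlwaysOk),
                  TryIncreBuild(false, AlwaysOk, AlwaysOk));  // hit: 0, miss: 1
      return 0;
    }
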
+ ret = IncreBuild(graph_node, ge_model); if (ret != SUCCESS) { - GELOGE(ret, "PreRun Failed."); - return ret; + ret = PreRun(graph_node, inputs, ge_models, ge_model, session_id); + if (ret != SUCCESS) { + GELOGE(ret, "PreRun Failed."); + return ret; + } } ret = LoadGraph(ge_model, graph_node); if (ret != SUCCESS) { @@ -492,6 +506,90 @@ Status GraphManager::LoadGraph(const GeModelPtr &ge_model, const GraphNodePtr &g return SUCCESS; } +Status GraphManager::LoadFromCache(const GraphNodePtr &graph_node, const ModelCacheHelperPtr &cache_helper, + GeModelPtr &ge_model) { + auto graph_id = graph_node->GetGraphId(); + auto ret = cache_helper->LoadOmModelFromCache(ge_model); + if (ret != SUCCESS) { + GELOGW("Fail to load om model from cache."); + if (cache_helper->ClearCache(graph_id) != SUCCESS) { + GELOGW("Fail to clear cache of graph %u.", graph_id); + } + return FAILED; + } + ret = cache_helper->RecoverVarManagerFromCache(); + if (ret != SUCCESS) { + GELOGW("Fail to recover VarManager from cache."); + if (cache_helper->ClearCache(graph_id) != SUCCESS) { + GELOGW("Fail to clear cache of graph %u.", graph_id); + } + return FAILED; + } + ComputeGraphPtr compute_graph_in_model = GraphUtils::GetComputeGraph(ge_model->GetGraph()); + if (compute_graph_in_model == nullptr) { + GELOGW("Error occurred when get compute graph from om, abandon."); + return FAILED; + } else { + graph_node->SetComputeGraph(compute_graph_in_model); + graph_node->SetGeModel(ge_model); + GELOGI("Load model and graph form cache om file."); + } + return SUCCESS; +} + +Status GraphManager::SaveCacheBeforeBuild(uint32_t graph_id, const ModelCacheHelperPtr &cache_helper) { + auto ret = cache_helper->SaveCacheInfoToCache(); + if (ret != SUCCESS) { + GELOGW("Fail to save cache info of graph[%d] to cache.", graph_id); + return FAILED; + } + ret = cache_helper->SaveVarManagerToCache(true); + if (ret != SUCCESS) { + GELOGW("Fail to save var manager to cache."); + cache_helper->ClearCache(graph_id); + return FAILED; + } + GELOGI("Cache files have been saved."); + return SUCCESS; +} + +Status GraphManager::SaveCacheAfterBuild(uint32_t graph_id, ge::ComputeGraphPtr graph, GeModelPtr &ge_model) { + std::shared_ptr instance_ptr = ge::GELib::GetInstance(); + if ((instance_ptr == nullptr) || !instance_ptr->InitFlag()) { + GELOGW("GELib not initialized."); + return FAILED; + } + + if (instance_ptr->IsIncreBuild()) { + auto iter = cache_helper_map_.find(graph_id); + if (iter == cache_helper_map_.end()) { + GELOGW("Can not find ModelCacheHelper of graph[%u]", graph_id); + return FAILED; + } else { + ModelCacheHelperPtr cache_helper = iter->second; + auto ret = cache_helper->RefreshComputeGraph(graph); + if (ret != SUCCESS) { + cache_helper->ClearCache(graph_id); + GELOGW("Fail to refresh cache helper's compute graph"); + return FAILED; + } + ret = cache_helper->SaveVarManagerToCache(false); + if (ret != SUCCESS) { + cache_helper->ClearCache(graph_id); + GELOGW("Fail to save VarManager to cache"); + return FAILED; + } + ret = cache_helper->SaveOmModelToCache(ge_model); + if (ret != SUCCESS) { + cache_helper->ClearCache(graph_id); + GELOGW("Fail to save om model to cache"); + return FAILED; + } + } + } + return SUCCESS; +} + Status GraphManager::InnerRunGraph(GraphNodePtr &graph_node, const GraphId &graph_id, const std::vector &inputs, std::vector &outputs) { Status ret = graph_executor_.SetCondition(&sync_run_mutex_, &condition_, graph_run_listener_); @@ -551,6 +649,9 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vector 
ge_models; if (options_.local_fmk_op_flag) { @@ -583,7 +684,7 @@ Status GraphManager::RunGraph(const GraphId &graph_id, const std::vectorGetSubGraph(); if (IsCheckpointGraph(checkPointGraph)) { - ret = CheckpointHandle(graph_id, outputs); + ret = CheckpointHandle(graph_id, checkPointGraph, outputs); if (ret != SUCCESS) { GELOGE(ret, "[RunGraph] CheckpointHandle failed!"); } @@ -667,6 +768,15 @@ Status GraphManager::SaveParams(ge::GeModel &model, const std::string &type, con return SUCCESS; } +void GraphManager::RemoveModelCacheHelper(const GraphId &graph_id) { + auto iter = cache_helper_map_.find(graph_id); + if (iter != cache_helper_map_.end()) { + cache_helper_map_.erase(iter); + } else { + GELOGW("[GraphManager] cache helper does not exist, graph_id = %u", graph_id); + } +} + Status GraphManager::RemoveGraph(const GraphId &graph_id) { auto it = graph_map_.find(graph_id); if (it == graph_map_.end()) { @@ -716,6 +826,9 @@ Status GraphManager::RemoveGraph(const GraphId &graph_id) { } var_acc_ctrl_.RemoveGraph(graph_id); graph_map_.erase(it); + + RemoveModelCacheHelper(graph_id); + auto ge_model = graph_node->GetGeModel(); if (ge_model != nullptr) { GELOGI("Unload model %u.", ge_model->GetModelId()); @@ -1106,21 +1219,15 @@ Status GraphManager::SummaryHandle(const GraphId &graph_id, std::vector &outputs) { +Status GraphManager::CheckpointHandle(const GraphId &graph_id, const ComputeGraphPtr &compute_graph, + const std::vector<GeTensor> &outputs) { GELOGI("[GraphManager] CheckpointHandle, outputsSize=%zu.", outputs.size()); std::vector<InputOutputDescInfo> outputs_desc = graph_executor_.GetOutputsDesc(); GELOGI("[GraphManager] CheckpointHandle, outputsDescSize=%zu.", outputs_desc.size()); - // find graph - GraphNodePtr graph_node = nullptr; - Status ret = GetGraphNode(graph_id, graph_node); - if (ret != SUCCESS) { - GELOGE(ret, "[CheckpointHandle] graph not exist, graph_id = %u.", graph_id); - return ret; - } - ComputeGraphPtr compute_graph_ptr = GraphUtils::GetComputeGraph(*(graph_node->GetGraph())); + std::map<string, Tensor> save_results; NodePtr netoutput = nullptr; - for (const auto &node : compute_graph_ptr->GetDirectNode()) { + for (const auto &node : compute_graph->GetDirectNode()) { if (node->GetType() == kNetOutput) { netoutput = node; break; @@ -1248,6 +1355,8 @@ bool GraphManager::CheckTransOpForCheckpointGraph(NodePtr &node) { return true; } +static inline bool CheckConstantOpForCheckpointGraph(NodePtr &node) { return node->GetOutDataNodes().empty(); } + bool GraphManager::IsCheckpointGraph(ComputeGraphPtr &compute_graph) { if (compute_graph == nullptr) { GELOGE(GE_GRAPH_PARAM_NULLPTR, "[IsCheckpointGraph] computeGraph is nullptr."); @@ -1268,6 +1377,10 @@ bool GraphManager::IsCheckpointGraph(ComputeGraphPtr &compute_graph) { if (!CheckTransOpForCheckpointGraph(node)) { return false; } + } else if (op->GetType() == CONSTANTOP) { + if (!CheckConstantOpForCheckpointGraph(node)) { + return false; + } } else if (op->GetType() != kSend && op->GetType() != kRecv) { GELOGI("this node is not allowed in checkpoint sub graph, node_type: %s, node_name: %s.", op->GetType().c_str(), op->GetName().c_str()); @@ -1439,8 +1552,6 @@ Status GraphManager::OptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_gra names_to_passes.emplace_back("ReshapeRemovePass", &trans_op_nearby_allreduce_fusion_pass); ReshapeRemovePass reshape_remove_pass; names_to_passes.emplace_back("ReshapeRemovePass", &reshape_remove_pass); - ReplaceWithEmptyConstPass replace_with_empty_const_pass; - names_to_passes.emplace_back("ReplaceWithEmptyConstPass", 
&replace_with_empty_const_pass); ConstantFoldingPass constant_folding_pass; names_to_passes.emplace_back("ConstantFoldingPass", &constant_folding_pass); DimensionAdjustPass dimension_adjust_pass; @@ -1632,6 +1743,51 @@ Status GraphManager::RunGraphAsync(const GraphId &graph_id, const std::vector instance_ptr = ge::GELib::GetInstance(); + if (instance_ptr != nullptr && instance_ptr->IsIncreBuild()) { + auto iter = cache_helper_map_.find(graph_id); + if (iter == cache_helper_map_.end()) { + ModelCacheHelperPtr cache_helper = MakeShared<ModelCacheHelper>(session_id, graph_id, compute_graph); + if (cache_helper != nullptr) { + cache_helper_map_.emplace(std::make_pair(graph_id, cache_helper)); + } else { + GELOGW("Cache helper make shared failed, graph_id = %u.", graph_id); + } + } + } +} + +Status GraphManager::IncreBuild(const GraphNodePtr &graph_node, GeModelPtr &ge_model) { + std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance(); + if (instance_ptr == nullptr || !instance_ptr->IsIncreBuild()) { + return FAILED; + } + const uint32_t graph_id = graph_node->GetGraphId(); + auto iter = cache_helper_map_.find(graph_id); + if (iter == cache_helper_map_.end()) { + GELOGW("Can not find ModelCacheHelper of graph[%u]", graph_id); + return FAILED; + } + ModelCacheHelperPtr cache_helper = iter->second; + if (cache_helper->IsModelCacheHit()) { + GEEVENT("Model cache hit."); + Status ret = LoadFromCache(graph_node, cache_helper, ge_model); + if (ret == SUCCESS) { + return SUCCESS; + } else { + GELOGW("Error occurred when loading from cache, abandon."); + } + } else { + GEEVENT("Model cache miss."); + } + if (SaveCacheBeforeBuild(graph_node->GetGraphId(), cache_helper) != SUCCESS) { + GELOGW("Error occurred when saving cache."); + } + return FAILED; +} + void GraphManager::PreRunThread(GraphManager *graph_manager) { if (prctl(PR_SET_NAME, ("GE_PreRun")) != 0) { GELOGW("Set thread name failed."); @@ -1685,6 +1841,8 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { return; } } + // when incremental build is enabled, create and register the cache helper + graph_manager->AddModelCacheHelperToMap(args.graph_id, args.session_id, compute_graph_tmp); std::vector<GeModelPtr> ge_models; @@ -1707,12 +1865,15 @@ void GraphManager::PreRunThread(GraphManager *graph_manager) { return; } - ret = graph_manager->PreRun(graph_node, ge_inputs, ge_models, ge_model, args.session_id); - if (ret != SUCCESS) { - graph_node->SetRunFlag(false); - ReturnError(graph_manager, args.callback, ret, "PreRun failed, thread exit."); - graph_node->Unlock(); - return; + // check whether the model can be restored from the incremental build cache
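+ // Same fallback as in StartForRunGraph above: only when IncreBuild cannot restore the model from cache does the thread fall through to a full PreRun.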
+ if (graph_manager->IncreBuild(graph_node, ge_model) != SUCCESS) { + ret = graph_manager->PreRun(graph_node, ge_inputs, ge_models, ge_model, args.session_id); + if (ret != SUCCESS) { + graph_node->SetRunFlag(false); + ReturnError(graph_manager, args.callback, ret, "PreRun failed, thread exit."); + graph_node->Unlock(); + return; + } } graph_node->SetBuildFlag(true); graph_manager->var_acc_ctrl_.SetGraphBuildEnd(graph_node->GetGraphId()); diff --git a/src/ge/graph/manager/graph_manager.h b/src/ge/graph/manager/graph_manager.h index 5a296b91..92ea48c5 100644 --- a/src/ge/graph/manager/graph_manager.h +++ b/src/ge/graph/manager/graph_manager.h @@ -27,6 +27,7 @@ #include "common/blocking_queue.h" #include "common/ge_inner_error_codes.h" +#include "common/helper/model_cache_helper.h" #include "external/graph/types.h" #include "ge/ge_api_types.h" #include "graph/build/graph_builder.h" @@ -211,7 +212,8 @@ class GraphManager { Status SummaryHandle(const GraphId &graph_id, std::vector<GeTensor> &outputs); - Status CheckpointHandle(const GraphId &graph_id, const std::vector &outputs); + Status CheckpointHandle(const GraphId &graph_id, const ComputeGraphPtr &compute_graph, + const std::vector<GeTensor> &outputs); // call the callback function of ME to push summary result data to ME Status PushSummaryData2ME(const GraphId &graph_id, const std::map<std::string, ge::Tensor> &summary_data); @@ -260,6 +262,13 @@ class GraphManager { bool IsGraphNeedBuild(const GraphNodePtr &graph_node); + Status LoadFromCache(const GraphNodePtr &graph_node, const ModelCacheHelperPtr &cache_helper, GeModelPtr &ge_model); + Status SaveCacheBeforeBuild(uint32_t graph_id, const ModelCacheHelperPtr &cache_helper); + Status SaveCacheAfterBuild(uint32_t graph_id, ComputeGraphPtr graph, GeModelPtr &ge_model); + void AddModelCacheHelperToMap(const GraphId &graph_id, uint64_t session_id, ComputeGraphPtr &compute_graph); + Status IncreBuild(const GraphNodePtr &graph_node, GeModelPtr &ge_model); + void RemoveModelCacheHelper(const GraphId &graph_id); + static void PreRunThread(GraphManager *graph_manager); static void RunThread(GraphManager *graph_manager); static void StopQueue(GraphManager *graph_manager); @@ -274,6 +283,8 @@ class GraphManager { std::map<GraphId, GraphNodePtr> graph_map_; + std::map<GraphId, ModelCacheHelperPtr> cache_helper_map_; + // for run graph synchronous return std::mutex sync_run_mutex_; std::condition_variable condition_; diff --git a/src/ge/graph/manager/graph_var_manager.cc b/src/ge/graph/manager/graph_var_manager.cc index d5ffbd03..f40ca7ce 100644 --- a/src/ge/graph/manager/graph_var_manager.cc +++ b/src/ge/graph/manager/graph_var_manager.cc @@ -64,6 +64,10 @@ ge::Status VarResource::GetVarAddr(const std::string &var_name, const ge::GeTens return SUCCESS; } +void VarResource::GetAllVarAddrMgr(std::unordered_map<std::string, VarAddrMgr> &var_addr_mgr_map) { + var_addr_mgr_map = var_addr_mgr_map_; +} + void VarResource::SetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *dev_ptr, rtMemType_t memory_type) { std::string var_key = VarKey(var_name, tensor_desc); @@ -170,6 +174,14 @@ void VarResource::SaveBroadCastInfo(uint32_t graph_id, const VarBroadCastInfo &b var_broad_cast_info_[graph_id][broad_cast_info.var_name] = broad_cast_info; } +ge::Status VarResource::GetBroadCastInfo(uint32_t graph_id, const string &var_name, VarBroadCastInfo &broad_cast_info) { + if (var_broad_cast_info_.count(graph_id) == 0 || var_broad_cast_info_[graph_id].count(var_name) == 0) { + return FAILED; + } + broad_cast_info = var_broad_cast_info_[graph_id][var_name]; + return SUCCESS; +} + ge::Status 
VarResource::SyncVarData2BroadCast(uint32_t graph_id, const std::string &var_name, const ge::ConstOpDescPtr &var_op_desc, uint8_t *base_ptr) { if (var_op_desc == nullptr) { @@ -282,11 +294,17 @@ Status MemResource::AssignVarMem(const std::string &var_name, uint64_t size, uin // align 512 BYTE var_mem_size_ = var_mem_size_ + kSessionMemAlignSize; + GELOGI( + "[IMAS]AssignVarMem Set session_%lu name[%s] output[%d] " + "offset to [%zu] size[%lu] realsize[%lu].", + session_id, var_name.c_str(), 0, mem_offset, (var_mem_size_ - mem_offset), real_size); return SUCCESS; } int64_t MemResource::GetVarMemSize() const { return var_mem_size_; } +void MemResource::UpdateVarMemSize(int64_t mem_size) { var_mem_size_ = mem_size; } + VarManager::VarManager(uint64_t session_id) : version_(SessionVersion::OTHER_VERSION), session_id_(session_id), @@ -363,6 +381,21 @@ ge::Status VarManager::SetVarAddr(const std::string &var_name, const ge::GeTenso return ge::SUCCESS; } +ge::Status VarManager::SaveVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *address, + rtMemType_t memory_type) { + GELOGI("VarManager::SaveVarAddr var_name = %s, data_type = %s, data_format = %s.", var_name.c_str(), + ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str()); + + std::lock_guard lock(mutex_); + if (var_resource_ == nullptr) { + GELOGW("VarManager has not been init."); + return ge::INTERNAL_ERROR; + } + var_resource_->SaveVarAddr(var_name, tensor_desc, address, memory_type); + return ge::SUCCESS; +} + ge::Status VarManager::GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr, rtMemType_t &memory_type) { std::lock_guard lock(mutex_); @@ -388,6 +421,10 @@ ge::Status VarManager::GetVarAddr(const std::string &var_name, const ge::GeTenso return GetVarAddr(var_name, tensor_desc, dev_ptr, memory_type); } +void VarManager::GetAllVarAddrMgr(std::unordered_map<std::string, VarAddrMgr> &var_addr_mgr_map) { + var_resource_->GetAllVarAddrMgr(var_addr_mgr_map); +} + int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) { std::lock_guard lock(mutex_); MemResource *mem_resource = nullptr; @@ -405,14 +442,36 @@ int64_t VarManager::GetVarMemSize(rtMemType_t memory_type) { return mem_resource->GetVarMemSize(); } +Status VarManager::UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size) { + std::lock_guard lock(mutex_); + MemResource *mem_resource = nullptr; + auto iter = mem_resource_map_.find(memory_type); + if (iter == mem_resource_map_.end()) { + mem_resource = new (std::nothrow) MemResource(); + if (mem_resource == nullptr) { + GELOGE(ge::INTERNAL_ERROR, "Alloc MemResource failed, memory_type = %u.", memory_type); + return ge::INTERNAL_ERROR; + } else { + mem_resource_map_[memory_type] = mem_resource; + } + } else { + mem_resource = iter->second; + } + + if (mem_resource == nullptr) { + GELOGE(ge::INTERNAL_ERROR, "MemResource is invalid."); + return FAILED; + } + mem_resource->UpdateVarMemSize(mem_size); + return SUCCESS; +} + ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, rtMemType_t memory_type) { std::lock_guard lock(mutex_); - GELOGI( - "VarManager::AssignVarMem var_name = %s, data_type = %s, data_format = " - "%s.", - var_name.c_str(), ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), - ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str()); + GELOGI("VarManager::AssignVarMem var_name = %s, 
data_type = %s, data_format = %s.", var_name.c_str(), + ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str()); int64_t tensor_desc_size = 0; size_t mem_offset = 0; @@ -475,14 +534,13 @@ ge::Status VarManager::AssignVarMem(const std::string &var_name, const ge::GeTen if (cur_tensor_desc.GetFormat() != tensor_desc.GetFormat() || cur_tensor_desc.GetDataType() != tensor_desc.GetDataType() || cur_tensor_desc.GetShape().GetDims() != tensor_desc.GetShape().GetDims()) { - GELOGI( - "var %s assigned new memory (format, data type, shape) (%s, %s, " - "%zu) from (%s, %s, %zu)", - var_name.c_str(), ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), - ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str(), tensor_desc.GetShape().GetDims().size(), - ge::TypeUtils::DataTypeToSerialString(cur_tensor_desc.GetDataType()).c_str(), - ge::TypeUtils::FormatToSerialString(cur_tensor_desc.GetFormat()).c_str(), - cur_tensor_desc.GetShape().GetDims().size()); + GELOGI("var %s assigned new memory (format, data type, shape) (%s, %s, %zu) from (%s, %s, %zu)", var_name.c_str(), + ge::TypeUtils::DataTypeToSerialString(tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(tensor_desc.GetFormat()).c_str(), + tensor_desc.GetShape().GetDims().size(), + ge::TypeUtils::DataTypeToSerialString(cur_tensor_desc.GetDataType()).c_str(), + ge::TypeUtils::FormatToSerialString(cur_tensor_desc.GetFormat()).c_str(), + cur_tensor_desc.GetShape().GetDims().size()); var_resource_->SetVarAddr(var_name, tensor_desc, reinterpret_cast(reinterpret_cast(mem_offset)), memory_type); } @@ -550,6 +608,16 @@ ge::Status VarManager::SaveBroadCastInfo(uint32_t graph_id, const VarBroadCastIn return SUCCESS; } +ge::Status VarManager::GetBroadCastInfo(uint32_t graph_id, const string &var_name, VarBroadCastInfo &broad_cast_info) { + std::lock_guard lock(mutex_); + + if (var_resource_ == nullptr) { + GELOGW("VarManager has not been init."); + return ge::INTERNAL_ERROR; + } + return var_resource_->GetBroadCastInfo(graph_id, var_name, broad_cast_info); +} + ge::Status VarManager::RenewCurVarDesc(const std::string &var_name, ge::OpDescPtr op_desc) { std::lock_guard lock(mutex_); GELOGD("VarManager::RenewCurVarDesc var_name = %s.", var_name.c_str()); @@ -672,6 +740,7 @@ Status VarManager::SetMemoryMallocSize(const map &options) { GELOGE(ge::GE_GRAPH_OPTIONS_INVALID, "Parse graph memory manager malloc max size failed."); return ge::GE_GRAPH_OPTIONS_INVALID; } + GELOGI("The max size for graph mem is set to %zu", graph_mem_max_size_); } it = options.find(VARIABLE_MEMORY_MAX_SIZE); diff --git a/src/ge/graph/manager/graph_var_manager.h b/src/ge/graph/manager/graph_var_manager.h index a2b974e4..8b551e06 100644 --- a/src/ge/graph/manager/graph_var_manager.h +++ b/src/ge/graph/manager/graph_var_manager.h @@ -101,6 +101,8 @@ class VarResource { ge::Status GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr, rtMemType_t &memory_type); + void GetAllVarAddrMgr(std::unordered_map &var_addr_mgr_map); + void SetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *dev_ptr, rtMemType_t rtMemType_t); @@ -113,6 +115,8 @@ class VarResource { void SaveBroadCastInfo(uint32_t graph_id, const VarBroadCastInfo &broad_cast_info); + ge::Status GetBroadCastInfo(uint32_t graph_id, const string &var_name, VarBroadCastInfo &broad_cast_info); + ge::Status 
SyncVarData2BroadCast(uint32_t graph_id, const std::string &var_name, const ge::ConstOpDescPtr &var_op_desc, uint8_t *base_ptr); @@ -175,6 +179,8 @@ class MemResource { int64_t GetVarMemSize() const; + void UpdateVarMemSize(int64_t mem_size); + private: uint64_t total_size_; uint64_t var_mem_size_; @@ -196,9 +202,14 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY VarManager { ge::Status SetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *dev_ptr, rtMemType_t memory_type); + ge::Status SaveVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t *address, + rtMemType_t memory_type); + ge::Status GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr, rtMemType_t &memory_type); + void GetAllVarAddrMgr(std::unordered_map &var_addr_mgr_map); + ge::Status GetVarAddr(const std::string &var_name, const ge::GeTensorDesc &tensor_desc, uint8_t **dev_ptr); ge::Status SyncVarData(uint32_t graph_id, const std::string &var_name, ge::ConstOpDescPtr var_op_desc, @@ -206,6 +217,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY VarManager { ge::Status SaveBroadCastInfo(uint32_t graph_id, const VarBroadCastInfo &broad_cast_info); + ge::Status GetBroadCastInfo(uint32_t graph_id, const string &var_name, VarBroadCastInfo &broad_cast_info); + ge::Status SyncBroadCastData2Var(uint32_t graph_id, const std::string &var_name, ge::ConstOpDescPtr var_op_desc, uint8_t *base_ptr); @@ -251,6 +264,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY VarManager { int64_t GetVarMemSize(rtMemType_t memory_type); + Status UpdateVarMemSize(rtMemType_t memory_type, int64_t mem_size); + bool IsVarExist(const std::string &var_name, const ge::GeTensorDesc &tensor_desc); bool IsVarExist(const std::string &var_name); diff --git a/src/ge/graph/partition/graph_partition.cc b/src/ge/graph/partition/graph_partition.cc index feced331..bc8c9b9b 100644 --- a/src/ge/graph/partition/graph_partition.cc +++ b/src/ge/graph/partition/graph_partition.cc @@ -238,6 +238,14 @@ Status ge::GraphPartitioner::MergeSubGraph(ge::ComputeGraphPtr &output_merged_co return FAILED; } GE_TIMESTAMP_END(MergeGraphTopologicalSorting, "GraphPartitioner::MergeGraphTopologicalSorting"); + // flush all nodes' engine of merged graph + GE_TIMESTAMP_START(MergeGraphEnginePlacerRun); + graph_info_.engine_placer_.SetComputeGraph(output_merged_compute_graph); + if (graph_info_.engine_placer_.Run() != SUCCESS) { + GELOGE(GE_GRAPH_INIT_FAILED, "[GraphPartitioner]: engine_placer run failed"); + return FAILED; + } + GE_TIMESTAMP_END(MergeGraphEnginePlacerRun, "GraphPartitioner::MergeGraphEnginePlacerRun"); GELOGI("Graph merge ends."); return SUCCESS; } diff --git a/src/ge/graph/passes/atomic_addr_clean_pass.cc b/src/ge/graph/passes/atomic_addr_clean_pass.cc index 6c312efa..e95f0680 100644 --- a/src/ge/graph/passes/atomic_addr_clean_pass.cc +++ b/src/ge/graph/passes/atomic_addr_clean_pass.cc @@ -200,7 +200,18 @@ bool AtomicAddrCleanPass::IsAtomicOp(const NodePtr &node) { vector op_info_vec = ops_kernel_manager.GetOpsKernelInfo(op_desc->GetType()); for (const auto &op_info : op_info_vec) { if (op_info.isAtomic) { - GELOGI("Recognized atomic op %s from HCCL engine.", op_desc->GetName().c_str()); + GELOGI("Recognized atomic op %s from DNN_HCCL engine.", op_desc->GetName().c_str()); + // check peer input is DATA + for (auto &in_data_anchor : node->GetAllInDataAnchors()) { + if (in_data_anchor->GetPeerOutAnchor() != nullptr && + 
in_data_anchor->GetPeerOutAnchor()->GetOwnerNode() != nullptr) { + auto peer_in_node = in_data_anchor->GetPeerOutAnchor()->GetOwnerNode(); + if (peer_in_node->GetType() == DATA) { + GELOGI("Recognized atomic op %s from DNN_HCCL engine and input is DATA.", op_desc->GetName().c_str()); + return false; + } + } + } hcom_node_vec_.push_back(node); return true; } diff --git a/src/ge/graph/passes/folding_kernel/cast_kernel.cc b/src/ge/graph/passes/folding_kernel/cast_kernel.cc index bcd26f70..99944c20 100644 --- a/src/ge/graph/passes/folding_kernel/cast_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/cast_kernel.cc @@ -49,9 +49,11 @@ Status CastKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetData().data(); - if (op_desc_ptr == nullptr || src_data == nullptr) { - GELOGE(PARAM_INVALID, "Parameter's invalid, Input opDescPtr or src_data is nullptr."); + // src_data == nullptr is supported + if (op_desc_ptr == nullptr) { + GELOGE(PARAM_INVALID, "Parameter's invalid, Input opDescPtr is nullptr."); return PARAM_INVALID; } GeTensorDesc op_desc = op_desc_ptr->GetOutputDesc(0); @@ -73,7 +75,7 @@ Status CastKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorGetData().GetSize()); + // const_weight_ptr->GetData().GetSize() == 0 is supported auto src_data_size = src_shape.GetShapeSize(); if (src_data_size == 0 && static_cast(const_weight_ptr->GetData().GetSize()) == GetSizeByDataType(src_data_type)) { @@ -113,7 +115,6 @@ Status CastKernel::Compute(const OpDescPtr op_desc_ptr, const std::vectorSetData(trans_result.data.get(), trans_result.length) != SUCCESS) { GELOGW("Compute: SetData failed"); - return FAILED; } v_output.push_back(output_ptr); return SUCCESS; diff --git a/src/ge/graph/passes/folding_kernel/kernel_utils.cc b/src/ge/graph/passes/folding_kernel/kernel_utils.cc index 9448b232..2002643a 100644 --- a/src/ge/graph/passes/folding_kernel/kernel_utils.cc +++ b/src/ge/graph/passes/folding_kernel/kernel_utils.cc @@ -113,12 +113,26 @@ bool KernelUtils::CheckSizeForTransOp(const ge::ConstGeTensorPtr &const_weight_p GELOGI("Const real value Size:%zu, op_desc Shape Size:%ld, data_type:%s.", data_size, cal_size, TypeUtils::DataTypeToSerialString(data_type).c_str()); - if ((shape_size != 0) || (length != 0 && (data_size / static_cast(length) != 1))) { - if (!(data_size == static_cast(cal_size) && data_size != 0)) { + if (shape_size != 0) { + // Standard tensor + if (data_size != static_cast(cal_size) || data_size == 0) { + GELOGW("Const input data size is not equal with tensor desc shape"); + return false; + } + } else if (data_shape.GetDimNum() != 0) { + // Empty tensor, has zero in shape vector + if (data_size != 0) { + GELOGW("Const input data size is not equal with tensor desc shape"); + return false; + } + } else { + // Scalar tensor, has only one element in tensor + if (length != 0 && (data_size / static_cast(length) != 1)) { GELOGW("Const input data size is not equal with tensor desc shape"); return false; } } + return true; } diff --git a/src/ge/graph/passes/folding_kernel/kernel_utils.h b/src/ge/graph/passes/folding_kernel/kernel_utils.h index 9eadf4ca..17b645aa 100644 --- a/src/ge/graph/passes/folding_kernel/kernel_utils.h +++ b/src/ge/graph/passes/folding_kernel/kernel_utils.h @@ -29,6 +29,7 @@ namespace ge { class KernelUtils { public: KernelUtils() = delete; + ~KernelUtils() = delete; static Status CheckDimensionNodeInfo(const NodePtr &node_ptr); static bool CheckFormatSupported(const NodePtr &node_ptr); static bool CheckSizeForTransOp(const ConstGeTensorPtr 
&const_weight_ptr, const OpDescPtr &op_desc_ptr); @@ -41,7 +42,7 @@ class KernelUtils { * @param [out] output the tensor for save sequence of numbers * @author */ - template<typename T> + template <typename T> static Status GenData(const int64_t data_num, const T value, const GeTensorPtr &output) { if (data_num > 0) { if (!CheckInt64MulOverflow(data_num, static_cast<int64_t>(sizeof(T)))) { @@ -69,12 +70,12 @@ class KernelUtils { } /** - * Calculate dimension - * @param [in] dims save the tensor of the dimension - * @param [in] vec_dim results of each dimension - * @param [out] data_num total size of data - * @author - */ + * Calculate dimension + * @param [in] dims save the tensor of the dimension + * @param [in] vec_dim results of each dimension + * @param [out] data_num total size of data + * @author + */ template <typename T> static Status CalcDims(const ConstGeTensorPtr dims, std::vector<int64_t> &vec_dim, int64_t &data_num) { data_num = 1; diff --git a/src/ge/graph/passes/folding_kernel/pack_kernel.cc b/src/ge/graph/passes/folding_kernel/pack_kernel.cc index c79acd76..5db3b394 100644 --- a/src/ge/graph/passes/folding_kernel/pack_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/pack_kernel.cc @@ -67,8 +67,8 @@ Status PackKernel::ValidateKernelParams(const ge::OpDescPtr &op_desc_ptr, return PARAM_INVALID; } if (!(AttrUtils::GetInt(op_desc_ptr, PACK_ATTR_NAME_NUM, n_))) { - GELOGE(PARAM_INVALID, "Attr %s is not exist.", PACK_ATTR_NAME_NUM.c_str()); - return PARAM_INVALID; + n_ = 0; + GELOGD("Attr %s is not set, default value %ld is used.", PACK_ATTR_NAME_NUM.c_str(), n_); } if (!(AttrUtils::GetInt(op_desc_ptr, ATTR_NAME_AXIS, axis_))) { GELOGE(PARAM_INVALID, "Attr %s is not exist.", ATTR_NAME_AXIS.c_str()); @@ -105,11 +105,7 @@ Status PackKernel::ValidateInputs(const ge::OpDescPtr &op_desc_ptr, const std::v GELOGW("Input %ld of pack kernel %s is null.", i, op_desc_ptr->GetName().c_str()); return PARAM_INVALID; } - // check if tensor contains data - if (input[i]->GetData().size() == 0) { - GELOGW("Inputs %ld do not have value.", i); - return NOT_CHANGED; - } + if (i == 0) { // get first input shape shape = input[0]->GetTensorDesc().GetShape(); @@ -127,8 +123,8 @@ Status PackKernel::ValidateInputs(const ge::OpDescPtr &op_desc_ptr, const std::v auto dst_shape = tensor_desc.GetShape(); int64_t num = 1; for (auto dim : dst_shape.GetDims()) { - if (dim < 1) { - GELOGW("Invalid zero dim in the shape %s", formats::ShapeToString(shape).c_str()); + if (dim < 0) { + GELOGW("Invalid dim %ld in the shape %s", dim, formats::ShapeToString(shape).c_str()); return NOT_CHANGED; } num *= dim; @@ -141,6 +137,12 @@ Status PackKernel::ValidateInputs(const ge::OpDescPtr &op_desc_ptr, const std::v GELOGW("Shape of input %ld is not equal with input 0.", i); return NOT_CHANGED; } + + // check whether tensor data size is zero or not + if (input[i]->GetData().size() == 0 && num != 0) { + GELOGW("Inputs %ld do not have value.", i); + return NOT_CHANGED; + } } return SUCCESS; } @@ -167,6 +169,13 @@ void PackKernel::ExpandDims(const int64_t axis, const std::vector &input, ge::GeTensorPtr &output_ptr) { + output_ptr->MutableTensorDesc().SetShape(final_shape); + output_ptr->MutableTensorDesc().SetDataType(DataType(data_type_)); + if (final_shape.GetShapeSize() == 0 && final_shape.GetDims().size() != 0) { + // means the shape contains a zero, so the output tensor data is [].
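+ // e.g. packing two inputs of shape [2, 0] along axis 0 yields final_shape [2, 2, 0], whose GetShapeSize() is 0, so there is no data to copy.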
+ return SUCCESS; + } + int64_t times = 1; int64_t unit = 1; // calculate data unit @@ -210,8 +219,6 @@ Status PackKernel::CopyOutputData(const GeShape &final_shape, const std::vector< if (output_ptr->SetData(buf.get(), static_cast(output_size * data_size)) != GRAPH_SUCCESS) { GELOGW("CopyOutputData: SetData failed"); } - output_ptr->MutableTensorDesc().SetShape(final_shape); - output_ptr->MutableTensorDesc().SetDataType(DataType(data_type_)); return SUCCESS; } diff --git a/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc b/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc index 76a67dac..b7fd11b1 100644 --- a/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/reduce_prod_kernel.cc @@ -63,10 +63,7 @@ Status ReduceProdKernel::ReduceProdCheck(const ge::OpDescPtr &op_desc_ptr, GELOGE(PARAM_INVALID, "Axis must be at most rank 1, node name: %s", op_desc_ptr->GetName().c_str()); return PARAM_INVALID; } - if (data_tensor->GetData().size() == 0 || axis_tensor->GetData().size() == 0) { - GELOGE(PARAM_INVALID, "ReduceProdKernel data size of inputs is 0, node node: %s", op_desc_ptr->GetName().c_str()); - return PARAM_INVALID; - } + DataType data_type = data_tensor->GetTensorDesc().GetDataType(); if (kReduceProdSupportedType.find(data_type) == kReduceProdSupportedType.end()) { GELOGE(PARAM_INVALID, "ReduceProdKernel data type %s not support, node name: %s", @@ -151,7 +148,6 @@ Status ReduceProdKernel::DataCal(const std::vector &input, static_cast(head_dim_ * end_dim_ * sizeof(int32_t))) != GRAPH_SUCCESS, GELOGW("set data failed"); return INTERNAL_ERROR); - output_ptr->MutableTensorDesc().SetDataType(data_dtype); } return SUCCESS; } @@ -260,19 +256,32 @@ Status ReduceProdKernel::Compute(const ge::OpDescPtr op_desc_ptr, const std::vec if (ret != SUCCESS) { return NOT_CHANGED; } + } else if (input.at(kReduceProdAxisIndex)->GetData().size() == 0) { + // the axis tensor is [], which means the input is passed through unchanged + output_ptr->MutableTensorDesc().SetShape(input.at(kReduceProdDataIndex)->GetTensorDesc().GetShape()); + output_ptr->MutableTensorDesc().SetDataType(input.at(kReduceProdDataIndex)->GetTensorDesc().GetDataType()); + if (output_ptr->SetData(input.at(kReduceProdDataIndex)->GetData()) != GRAPH_SUCCESS) { + GELOGW("Compute: SetData failed"); + } } else { // calculate axis to reduce ret = AxisCal(input); if (ret != SUCCESS) { return NOT_CHANGED; } - // calculate data and data type - ret = DataCal(input, output_ptr); - if (ret != SUCCESS) { - return NOT_CHANGED; - } - // calculate shape + // calculate and set shape ShapeCal(op_desc_ptr, input, output_ptr); + // set data type + output_ptr->MutableTensorDesc().SetDataType(input.at(kReduceProdDataIndex)->GetTensorDesc().GetDataType()); + + // data size == 0 means the input shape contains a zero, so the tensor value is [].
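+ // e.g. a data tensor of shape [3, 0, 2] holds no elements, so only the output shape and data type are set and DataCal is skipped.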
+ if (input.at(kReduceProdDataIndex)->GetData().size() != 0) { + // calculate data and data type + ret = DataCal(input, output_ptr); + if (ret != SUCCESS) { + return NOT_CHANGED; + } + } } // print output tensor information, and will be deleted diff --git a/src/ge/graph/passes/folding_kernel/transdata_kernel.cc b/src/ge/graph/passes/folding_kernel/transdata_kernel.cc index b1bfe92d..d3637169 100644 --- a/src/ge/graph/passes/folding_kernel/transdata_kernel.cc +++ b/src/ge/graph/passes/folding_kernel/transdata_kernel.cc @@ -48,8 +48,9 @@ Status TransdataKernel::ValidateInput(const OpDescPtr &op_desc_ptr, const std::v GELOGE(PARAM_INVALID, "Input const_weight_ptr is nullptr."); return PARAM_INVALID; } - const uint8_t *src_data = const_weight_ptr->GetData().data(); - if (op_desc_ptr == nullptr || src_data == nullptr) { + + // src_data == nullptr is supported + if (op_desc_ptr == nullptr) { GELOGE(PARAM_INVALID, "Input opDescPtr is nullptr."); return PARAM_INVALID; } diff --git a/src/ge/graph/passes/pass_utils.h b/src/ge/graph/passes/pass_utils.h index a8b1cfe3..b889a056 100644 --- a/src/ge/graph/passes/pass_utils.h +++ b/src/ge/graph/passes/pass_utils.h @@ -26,6 +26,7 @@ namespace ge { class PassUtils { public: PassUtils() = delete; + ~PassUtils() = delete; static NodePtr GetInDataNode(const ConstNodePtr &node, int index); diff --git a/src/ge/graph/passes/switch_op_pass.cc b/src/ge/graph/passes/switch_op_pass.cc index 5ed1cb1c..b21f962b 100644 --- a/src/ge/graph/passes/switch_op_pass.cc +++ b/src/ge/graph/passes/switch_op_pass.cc @@ -137,7 +137,7 @@ Status SwitchOpPass::ReplaceSwitchNode(ComputeGraphPtr &graph, NodePtr &switch_n NodePtr out_node = peer_in_anchor->GetOwnerNode(); GE_CHK_STATUS_RET(GetOriginalType(out_node, type), "Get node type fail."); if ((type == MERGE) || (type == REFMERGE)) { - NodePtr memcpy_node = CreateMemcpyAsyncNode(graph, peer_data_anchor); + NodePtr memcpy_node = CreateMemcpyAsyncNode(graph, peer_data_anchor, false); GE_CHK_BOOL_EXEC(memcpy_node != nullptr, return FAILED, "Create memcpy_async node fail."); GE_CHK_STATUS(GraphUtils::AddEdge(peer_data_anchor, memcpy_node->GetInDataAnchor(0)), "MemcpyAsync node add edge fail."); @@ -234,16 +234,18 @@ Status SwitchOpPass::ReplaceMergeNode(ComputeGraphPtr &graph, NodePtr &merge_nod need_label_nodes_.emplace_back(stream_merge); } + bool multi_batch_flag = false; if (merge_op_desc->HasAttr(ATTR_INSERT_BY_MBATCH)) { if (!ge::AttrUtils::SetBool(op_desc, ATTR_INSERT_BY_MBATCH, true)) { GELOGE(FAILED, "Set attr ATTR_INSERT_BY_MBATCH fail, StreamMerge:%s.", node_name.c_str()); return FAILED; } + multi_batch_flag = true; } (void)bypass_nodes_.insert(merge_node); - GE_CHK_STATUS_RET(AddMemcpyAsyncNodes(graph, stream_merge), "StreamMerge add memcpy node fail."); + GE_CHK_STATUS_RET(AddMemcpyAsyncNodes(graph, stream_merge, multi_batch_flag), "StreamMerge add memcpy node fail."); return SUCCESS; } @@ -302,17 +304,20 @@ NodePtr SwitchOpPass::CreateStreamSwitchNode(ComputeGraphPtr &graph, const NodeP /// @brief Add MemcpyAsync Node /// @param [in] graph /// @param [in] in_node +/// @param [in] multi_batch_flag /// @return ge::NodePtr /// -NodePtr SwitchOpPass::CreateMemcpyAsyncNode(ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor) { +NodePtr SwitchOpPass::CreateMemcpyAsyncNode(ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor, + bool multi_batch_flag) { GE_CHK_BOOL_EXEC(out_data_anchor != nullptr, return nullptr, "Param of input node is null."); OpDescPtr pre_op_desc = 
out_data_anchor->GetOwnerNode()->GetOpDesc(); GE_CHK_BOOL_EXEC(pre_op_desc != nullptr, return nullptr, "OpDesc of pre node is invalid."); - std::string node_name = pre_op_desc->GetName() + "_" + MEMCPYASYNC; + std::string memcpy_type = multi_batch_flag ? MEMCPYADDRASYNC : MEMCPYASYNC; + std::string node_name = pre_op_desc->GetName() + "_" + memcpy_type; node_name = CheckDuplicateName(node_name); GELOGI("Create MemcpyAsync op:%s.", node_name.c_str()); - OpDescPtr op_desc = MakeShared(node_name, MEMCPYASYNC); + OpDescPtr op_desc = MakeShared(node_name, memcpy_type); if (op_desc == nullptr) { GELOGE(FAILED, "Create op_desc fail, MemcpyAsync:%s.", node_name.c_str()); return nullptr; @@ -432,9 +437,10 @@ NodePtr SwitchOpPass::CreateActiveNode(ComputeGraphPtr &graph, NodePtr &node) { /// @brief Add MemcpyAsync Op as StreamMerge in_node /// @param [in] graph /// @param [in] node +/// @param [in] multi_batch_flag /// @return Status /// -Status SwitchOpPass::AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &node) { +Status SwitchOpPass::AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &node, bool multi_batch_flag) { GE_CHK_BOOL_EXEC(node != nullptr, return FAILED, "Param of pre node is null."); for (InDataAnchorPtr &in_data_anchor : node->GetAllInDataAnchors()) { OutDataAnchorPtr peer_out_anchor = in_data_anchor->GetPeerOutAnchor(); @@ -447,7 +453,7 @@ Status SwitchOpPass::AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &node) continue); GE_IF_BOOL_EXEC(type != MEMCPYASYNC, { - in_node = CreateMemcpyAsyncNode(graph, peer_out_anchor); + in_node = CreateMemcpyAsyncNode(graph, peer_out_anchor, multi_batch_flag); GE_CHK_BOOL_EXEC(in_node != nullptr, return FAILED, "Create MemcpyAsync node fail."); GE_CHK_STATUS(GraphUtils::RemoveEdge(peer_out_anchor, in_data_anchor), "MemcpyAsync node remove edge fail."); GE_CHK_STATUS(GraphUtils::AddEdge(peer_out_anchor, in_node->GetInDataAnchor(0)), diff --git a/src/ge/graph/passes/switch_op_pass.h b/src/ge/graph/passes/switch_op_pass.h index 14cdd22c..7e107e3b 100644 --- a/src/ge/graph/passes/switch_op_pass.h +++ b/src/ge/graph/passes/switch_op_pass.h @@ -103,13 +103,13 @@ class SwitchOpPass : public GraphPass { NodePtr CreateStreamSwitchNode(ComputeGraphPtr &graph, const NodePtr &switch_node, const std::string &suffix, OutDataAnchorPtr &peer_cond_anchor); - NodePtr CreateMemcpyAsyncNode(ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor); + NodePtr CreateMemcpyAsyncNode(ComputeGraphPtr &graph, const OutDataAnchorPtr &out_data_anchor, bool multi_batch_flag); Status CombineSwitchNode(ComputeGraphPtr &graph); NodePtr CreateActiveNode(ComputeGraphPtr &graph, NodePtr &node); - Status AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &stream_merge_node); + Status AddMemcpyAsyncNodes(ComputeGraphPtr &graph, NodePtr &stream_merge_node, bool multi_batch_flag); Status BypassSwitchNode(NodePtr &switch_node, OutDataAnchorPtr &peer_data_anchor, OutDataAnchorPtr &peer_cond_anchor); diff --git a/src/ge/graph/passes/variable_prepare_op_pass.cc b/src/ge/graph/passes/variable_prepare_op_pass.cc index c4ed0405..3a62082a 100644 --- a/src/ge/graph/passes/variable_prepare_op_pass.cc +++ b/src/ge/graph/passes/variable_prepare_op_pass.cc @@ -22,11 +22,14 @@ #include "common/ge/ge_util.h" #include "external/graph/graph.h" #include "framework/common/debug/ge_log.h" +#include "graph/common/omg_util.h" #include "graph/debug/ge_attr_define.h" #include "graph/node.h" #include "graph/utils/tensor_utils.h" namespace ge { +std::map> 
VariablePrepareOpPass::ref_node_without_prototype_map_{ + {REFSWITCH, {{0, 0}, {0, 1}}}}; Status VariablePrepareOpPass::Run(ComputeGraphPtr graph) { GE_CHECK_NOTNULL(graph); for (const auto &node : graph->GetDirectNode()) { @@ -43,9 +46,7 @@ Status VariablePrepareOpPass::Run(ComputeGraphPtr graph) { for (auto &node : graph->GetDirectNode()) { GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr, continue); - bool is_variable = node->GetOpDesc()->GetType() == VARIABLE; - bool is_deal = has_dealed_variable_.find(node->GetName()) == has_dealed_variable_.end(); - if (is_variable && is_deal) { + if (node->GetOpDesc()->GetType() == VARIABLE) { Status ret = DealVariableNode(node); if (ret != SUCCESS) { GELOGE(ret, "variable add back edge failed"); @@ -149,7 +150,7 @@ NodePtr VariablePrepareOpPass::GetFinalWritableNode(ge::NodePtr &writable_node, } } if (!found_writeable_node) { - GELOGI("final writable node is %s", current_node->GetName().c_str()); + GELOGD("final writable node is %s", current_node->GetName().c_str()); return current_node; } } @@ -159,53 +160,54 @@ Status VariablePrepareOpPass::AddVariableRef(ge::NodePtr &final_writable_node, g GE_CHECK_NOTNULL(final_writable_node); GE_CHECK_NOTNULL(var_node); - NodePtr var_ref_node = CreatVariableRef(final_writable_node, var_node); - GE_CHECK_NOTNULL(var_ref_node); - // add control anchor between var_ref_node and final peer node - // var_ref_node need to execute before other nodes + if (final_writable_node->GetType() == FRAMEWORKOP) { + GELOGD("No need to add variable_ref for frameworkop"); + return SUCCESS; + } + std::stringstream variable_ref_name; + variable_ref_name << "_TO_" << final_writable_node->GetName() << "_REF_" << index; + ge::NodePtr find_node = var_node->GetOwnerComputeGraph()->FindNode(var_node->GetName() + variable_ref_name.str()); + if (find_node != nullptr) { + GELOGD("The corresponding variable_ref [%s] has been added to this connection.", find_node->GetName().c_str()); + return SUCCESS; + } + NodePtr variable_ref_node = CreatVariableRef(var_node->GetName() + variable_ref_name.str(), var_node); + + GELOGI("Add variable_ref between [%s] and [%s]", var_node->GetName().c_str(), variable_ref_node->GetName().c_str()); + GE_CHECK_NOTNULL(variable_ref_node); + // add control anchor between variable_ref and final peer node + // variable_ref_node need to execute before other nodes auto final_writable_outAnchors = final_writable_node->GetAllOutAnchors(); for (auto &final_writable_outAnchor : final_writable_outAnchors) { GE_CHECK_NOTNULL(final_writable_outAnchor); for (auto &final_writable_peerAnchor : final_writable_outAnchor->GetPeerAnchors()) { GE_CHECK_NOTNULL(final_writable_peerAnchor); NodePtr peer_node = final_writable_peerAnchor->GetOwnerNode(); - graphStatus ret = ge::GraphUtils::AddEdge(var_ref_node->GetOutControlAnchor(), peer_node->GetInControlAnchor()); + graphStatus ret = + ge::GraphUtils::AddEdge(variable_ref_node->GetOutControlAnchor(), peer_node->GetInControlAnchor()); if (ret != GRAPH_SUCCESS) { - GELOGE(FAILED, "add control anchor between var_ref_node and final_writable peer_node failed"); + GELOGE(FAILED, "add control anchor between variable_ref and final_writable peer node failed"); return FAILED; } } } - // add edge final node:index ---> var_ref_node:0 graphStatus ret = - ge::GraphUtils::AddEdge(final_writable_node->GetOutDataAnchor(index), var_ref_node->GetInDataAnchor(0)); + ge::GraphUtils::AddEdge(final_writable_node->GetOutDataAnchor(index), variable_ref_node->GetInDataAnchor(0)); if (ret != GRAPH_SUCCESS) { - 
GELOGE(FAILED, "add data anchor between var_ref_node and final_writable peer_node failed"); + GELOGE(FAILED, "add data anchor between variable_ref and final_writable peer node failed"); return FAILED; } return SUCCESS; } -ge::NodePtr VariablePrepareOpPass::CreatVariableRef(ge::NodePtr &final_writable_node, ge::NodePtr &var_node) { - if ((final_writable_node == nullptr) || (var_node == nullptr) || (var_node->GetOwnerComputeGraph() == nullptr)) { - GELOGE(FAILED, "parameter ptr is null."); - return nullptr; - } - GELOGD("Create VarRef Op: final_writable_node: [%s] var_node: [%s]>>>>", final_writable_node->GetName().c_str(), - var_node->GetName().c_str()); - - static uint32_t var_ref_count = 0; - std::stringstream var_ref_name; - var_ref_name << "_to_" << final_writable_node->GetName() << "_REF_" << var_ref_count++; - +ge::NodePtr VariablePrepareOpPass::CreatVariableRef(const std::string &variable_ref_name, ge::NodePtr &var_node) { OpDescPtr var_op_desc = var_node->GetOpDesc(); if (var_op_desc == nullptr) { GELOGE(FAILED, "get var opdesc is nullptr"); return nullptr; } - OpDescPtr var_ref_op_desc = - MakeShared(var_node->GetName() + var_ref_name.str().c_str(), var_op_desc->GetType()); + OpDescPtr var_ref_op_desc = MakeShared(variable_ref_name.c_str(), var_op_desc->GetType()); if (var_ref_op_desc == nullptr) { GELOGE(FAILED, "var_ref opdesc is nullptr"); return nullptr; @@ -217,15 +219,15 @@ ge::NodePtr VariablePrepareOpPass::CreatVariableRef(ge::NodePtr &final_writable_ GE_IF_BOOL_EXEC(var_ref_op_desc->AddInputDesc(var_op_desc->GetOutputDesc(0)) != SUCCESS, GELOGW("add input desc edge failed"); return nullptr); - NodePtr var_ref_node = var_node->GetOwnerComputeGraph()->AddNode(var_ref_op_desc); - GE_IF_BOOL_EXEC(var_ref_node == nullptr, GELOGW("var_ref_node is null"); return nullptr); - has_dealed_variable_.insert(var_node->GetName()); + NodePtr variable_ref_node = var_node->GetOwnerComputeGraph()->AddNode(var_ref_op_desc); + GE_IF_BOOL_EXEC(variable_ref_node == nullptr, GELOGW("variable_ref_node is null"); return nullptr); bool is_set_str = ge::AttrUtils::SetStr(var_ref_op_desc, REF_VAR_SRC_VAR_NAME, var_op_desc->GetName()); if (is_set_str) { - GELOGD("Set node [%s] REF_VAR_SRC_VAR_NAME [%s]", var_ref_node->GetName().c_str(), var_op_desc->GetName().c_str()); + GELOGD("Set node [%s] REF_VAR_SRC_VAR_NAME [%s]", variable_ref_node->GetName().c_str(), + var_op_desc->GetName().c_str()); } - return var_ref_node; + return variable_ref_node; } int VariablePrepareOpPass::GetWritableNodeOutIndex(const NodePtr &node, int input_index) { @@ -240,16 +242,13 @@ int VariablePrepareOpPass::GetWritableNodeOutIndex(const NodePtr &node, int inpu } } - auto node_iter = ref_input_output_map_.find(node_type); - if (node_iter == ref_input_output_map_.end()) { - return -1; - } - - auto index_iter = node_iter->second.find(input_index); - if (index_iter == node_iter->second.end()) { - return -1; + if (node_type == FRAMEWORKOP) { + std::string original_type; + GE_IF_BOOL_EXEC(GetOriginalType(node, original_type) != SUCCESS, GELOGW("Get node original type fail")); + GELOGI("find frameworkop: [%s], original type is %s", node->GetName().c_str(), original_type.c_str()); + return FindRefOutIndex(original_type, input_index, ref_node_without_prototype_map_); } - return index_iter->second; + return FindRefOutIndex(node_type, input_index, ref_input_output_map_); } void VariablePrepareOpPass::GenerateRefTypeAndInputOutputMap(const NodePtr &node) { @@ -301,4 +300,18 @@ Status VariablePrepareOpPass::UpdateAssignOpDesc(const 
ge::NodePtr &node) { } return SUCCESS; } + +int VariablePrepareOpPass::FindRefOutIndex(const std::string &node_type, int input_index, + const std::map<std::string, std::map<int, int>> &ref_map) { + auto node_iter = ref_map.find(node_type); + if (node_iter == ref_map.end()) { + return -1; + } + + auto index_iter = node_iter->second.find(input_index); + if (index_iter == node_iter->second.end()) { + return -1; + } + return index_iter->second; +} } // namespace ge diff --git a/src/ge/graph/passes/variable_prepare_op_pass.h b/src/ge/graph/passes/variable_prepare_op_pass.h index 0fbd311c..fb25d5db 100644 --- a/src/ge/graph/passes/variable_prepare_op_pass.h +++ b/src/ge/graph/passes/variable_prepare_op_pass.h @@ -33,13 +33,15 @@ class VariablePrepareOpPass : public GraphPass { Status DealWritableNode(ge::NodePtr &writable_node, ge::NodePtr &var_node, int out_index); NodePtr GetFinalWritableNode(ge::NodePtr &writable_node, int &out_index); Status AddVariableRef(ge::NodePtr &node, ge::NodePtr &var_node, int index); - NodePtr CreatVariableRef(ge::NodePtr &final_ref_type_node, ge::NodePtr &var_node); + NodePtr CreatVariableRef(const std::string &variable_ref_name, ge::NodePtr &var_node); int GetWritableNodeOutIndex(const NodePtr &node, int input_index); Status UpdateAssignOpDesc(const ge::NodePtr &node); void GenerateRefTypeAndInputOutputMap(const NodePtr &node); + int FindRefOutIndex(const std::string &node_type, int input_index, + const std::map<std::string, std::map<int, int>> &ref_map); std::map<std::string, std::map<int, int>> ref_input_output_map_; - std::unordered_set<std::string> has_dealed_variable_{}; + static std::map<std::string, std::map<int, int>> ref_node_without_prototype_map_; }; } // namespace ge diff --git a/src/ge/graph/preprocess/graph_preprocess.cc b/src/ge/graph/preprocess/graph_preprocess.cc index a33bc8cc..eacec6d1 100644 --- a/src/ge/graph/preprocess/graph_preprocess.cc +++ b/src/ge/graph/preprocess/graph_preprocess.cc @@ -736,6 +736,35 @@ Status ProcessNetoutputNode(NodePtr &node, std::string &output_type) { } return SUCCESS; } + +Status CheckIfNeedSetNdFormat(const NodePtr &node_ptr) { + auto op = node_ptr->GetOpDesc(); + GE_CHECK_NOTNULL(op); + auto inputDescsPtr = op->GetAllInputsDescPtr(); + auto outputDescsPtr = op->GetAllOutputsDescPtr(); + ge::Format format = ge::FORMAT_ND; + // If the user sets a shape with more than 4 dimensions, format inference may set NCHW or NHWC; GE should reset + // it to ND before FE processes the graph, otherwise FE will insert TransData nodes.
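+ // e.g. a 5-D desc such as [8, 3, 224, 224, 16] inferred as NCHW would otherwise be bracketed with TransData nodes by FE; resetting it to ND avoids that.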
+ for (auto &inputDescPtr : inputDescsPtr) { + GE_CHECK_NOTNULL(inputDescPtr); + if ((inputDescPtr->GetShape().GetDims().size() > ge::DIM_DEFAULT_SIZE) && + ((inputDescPtr->GetFormat() == ge::FORMAT_NCHW) || (inputDescPtr->GetFormat() == ge::FORMAT_NHWC))) { + GELOGI("The node inputdesc [%s] format needs to be set to ND", op->GetName().c_str()); + inputDescPtr->SetFormat(format); + inputDescPtr->SetOriginFormat(format); + } + } + for (auto &outputDescPtr : outputDescsPtr) { + GE_CHECK_NOTNULL(outputDescPtr); + if ((outputDescPtr->GetShape().GetDims().size() > ge::DIM_DEFAULT_SIZE) && + ((outputDescPtr->GetFormat() == ge::FORMAT_NCHW) || (outputDescPtr->GetFormat() == ge::FORMAT_NHWC))) { + GELOGI("The node outputdesc [%s] format needs to be set to ND", op->GetName().c_str()); + outputDescPtr->SetFormat(format); + outputDescPtr->SetOriginFormat(format); + } + } + return SUCCESS; +} } // namespace GraphPrepare::GraphPrepare() : compute_graph_(nullptr) {} @@ -826,9 +855,12 @@ Status GraphPrepare::CheckGraph() { Status GraphPrepare::CheckRefInputNode(const NodePtr &node, const std::string &input_name, const std::unordered_set<std::string> &ref_nodes) { + // Acceptable input types are ref nodes, variables, and the Switch operator, which is issued by ME for dynamic + // loss scale and is optimized in SwitchOpPass, since ME does not differentiate between RefSwitch and Switch + // and only issues Switch. static std::unordered_set<std::string> acceptable_types = {ge::VARIABLE, ge::VARIABLEV2, ge::VARHANDLEOP, ge::REFSWITCH, ge::REFMERGE, ge::REFENTER, - ge::REFNEXTITERATION, ge::REFEXIT}; + ge::REFNEXTITERATION, ge::REFEXIT, ge::SWITCH}; GE_CHECK_NOTNULL(node); const auto &op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); @@ -972,7 +1004,7 @@ Status GraphPrepare::UpdateInput(const std::vector<GeTensor> &user_input) { int64_t desc_shape = desc.GetShape().GetShapeSize(); FMK_INT64_UINT32_MULCHECK(desc_shape, length); int64_t shape_size = desc_shape * length; - GE_IF_BOOL_EXEC(shape_size == 0, shape_size = static_cast<int64_t>(length)); + GE_IF_BOOL_EXEC(shape_size == 0 && desc.GetShape().GetDimNum() == 0, shape_size = static_cast<int64_t>(length)); int64_t size = 0; GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(desc, size) != GRAPH_SUCCESS, GELOGE(INTERNAL_ERROR, "TensorUtils GetSize failed"); @@ -1106,6 +1138,10 @@ Status GraphPrepare::OptimizeAfterInfershapeByAtcParams() { GE_RETURN_IF_ERROR(InsertNewOpUtil::Instance().UpdateDataNodeByAipp(compute_graph_)); for (auto &node_ptr : compute_graph_->GetDirectNode()) { GE_CHECK_NOTNULL(node_ptr); + if (CheckIfNeedSetNdFormat(node_ptr) != SUCCESS) { + GELOGE(INTERNAL_ERROR, "Set node [%s] format ND failed", node_ptr->GetName().c_str()); + return FAILED; + } if (node_ptr->GetType() == DATA) { if (ProcessDataNode(node_ptr) != SUCCESS) { GELOGE(INTERNAL_ERROR, "Process data node failed"); @@ -1416,9 +1452,17 @@ Status GraphPrepare::VerifyConstOp(const NodePtr &node) { FMK_INT64_UINT32_MULCHECK(shape_size, length); GELOGI("Const real value Size:%zu, op_desc Shape Size:%ld, data_type:%s.", data_size, shape_size * length, TypeUtils::DataTypeToSerialString(data_type).c_str()); - if ((shape_size != 0) || (data_size / length != 1)) { - GE_CHK_BOOL_EXEC(data_size == static_cast<size_t>(shape_size * length) && data_size != 0, - return GRAPH_PARAM_INVALID, "Const input data size is not equal with tensor desc shape"); + if (shape_size == 0) { + if (ge_tensor_desc.GetShape().GetDims().size() == 0) { + // shape = [] means it's a scalar tensor.
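+ // e.g. an int32 scalar: data_size == 4 and length (the element byte size) == 4, so data_size / length must equal 1.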
+ GE_CHK_BOOL_EXEC(data_size / length == 1, return PARAM_INVALID, "Const is invalid scalar tensor."); + } else { + // shape = [x, y, 0, ...] means it's a tensor with a zero dim, so its value is []. + GE_CHK_BOOL_EXEC(data_size == 0, return PARAM_INVALID, "Const is invalid vector tensor."); + } + } else { + GE_CHK_BOOL_EXEC(data_size == static_cast<size_t>(shape_size * length) && data_size != 0, return PARAM_INVALID, + "Const input data size is not equal with tensor desc shape"); } return SUCCESS; } @@ -1448,8 +1492,8 @@ Status GraphPrepare::CheckUserInput(const std::vector<GeTensor> &user_input) { GeTensorDesc desc(user_input[index].GetTensorDesc()); for (size_t i = 0; i < desc.GetShape().GetDimNum(); ++i) { - if (desc.GetShape().GetDim(i) <= 0) { - GELOGE(GE_GRAPH_INIT_FAILED, "data dim %zu is not supported, need > 0, real:%ld.", i, + if (desc.GetShape().GetDim(i) < 0) { + GELOGE(GE_GRAPH_INIT_FAILED, "data dim %zu is not supported, need >= 0, real:%ld.", i, desc.GetShape().GetDim(i)); return GE_GRAPH_INIT_FAILED; } @@ -1472,8 +1516,6 @@ Status GraphPrepare::InferShapeForPreprocess() { } InferShapePass infer_shape_pass; names_to_passes.emplace_back("InferShapePass", &infer_shape_pass); - ReplaceWithEmptyConstPass replace_with_empty_const_pass; - names_to_passes.emplace_back("ReplaceWithEmptyConstPass", &replace_with_empty_const_pass); DimensionComputePass dimension_compute_pass; names_to_passes.emplace_back("DimensionComputePass", &dimension_compute_pass); ConstantFoldingPass constant_folding_pass; diff --git a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc index fbdcc217..680e40c9 100644 --- a/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc +++ b/src/ge/graph/preprocess/insert_op/util_insert_aipp_op.cc @@ -124,22 +124,21 @@ Status InsertNewOpUtil::CheckGraph(const ComputeGraphPtr &graph) { if (node->GetType() != DATA) { continue; } - + size_t next_nodes_cnt = 0; std::vector<NodePtr> aippNodes; for (const auto &anchor : node->GetAllOutDataAnchors()) { for (const auto &inAnchor : anchor->GetPeerInDataAnchors()) { const std::string &nodeType = inAnchor->GetOwnerNode()->GetType(); - - GE_CHK_BOOL_RET_STATUS(aippNodes.size() == 0 || nodeType == AIPP, PARAM_INVALID, - "Can not config part of outputs of Data node to support AIPP, config all of the " - "outputs of Data to support AIPP, or config none of them"); - + next_nodes_cnt++; if (nodeType == AIPP) { aippNodes.push_back(inAnchor->GetOwnerNode()); continue; } } } + GE_CHK_BOOL_RET_STATUS((aippNodes.size() == 0) || (aippNodes.size() == next_nodes_cnt), PARAM_INVALID, + "Can not config part of outputs of Data node to support AIPP, config all " + "of the outputs of Data to support AIPP, or config none of them"); std::unique_ptr<domi::AippOpParams> aippParams(new (std::nothrow) domi::AippOpParams()); GE_CHECK_NOTNULL(aippParams); diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.cc b/src/ge/graph/preprocess/multi_batch_copy_graph.cc index b93b02f9..523c41cb 100644 --- a/src/ge/graph/preprocess/multi_batch_copy_graph.cc +++ b/src/ge/graph/preprocess/multi_batch_copy_graph.cc @@ -142,7 +142,7 @@ Status CalcShape(const std::vector<int64_t> &batch_shape, GeShape &data_shape) { bool IsAllDimsPositive(const std::vector<int64_t> &dims) { for (auto dim : dims) { - if (dim <= 0) { + if (dim < 0) { return false; } } diff --git a/src/ge/init/gelib.cc b/src/ge/init/gelib.cc index a9ef96c1..1b449521 100644 --- a/src/ge/init/gelib.cc +++ b/src/ge/init/gelib.cc @@ -16,8 +16,8 @@ #include "init/gelib.h" -#include #include +#include #include #include 
#include @@ -142,6 +142,35 @@ Status GELib::InnerInitialize(const map &options) { return SUCCESS; } +void GELib::SetIncreBuild(const map &options) { + auto iter = options.find(OPTION_EXEC_ENABLE_INCRE_BUILD); + if (iter != options.end()) { + const std::string enable_incre_build = "true"; + const std::string disable_incre_build = "false"; + if (iter->second == enable_incre_build) { + is_incre_build_ = true; + GELOGI("Enable incre build."); + auto path_iter = options.find(OPTION_EXEC_INCRE_BUILD_CACHE_PATH); + if (path_iter != options.end()) { + std::string cache_path = path_iter->second; + if (!cache_path.empty() && cache_path[cache_path.size() - 1] != '/') { + cache_path += "/"; + } + incre_build_cache_path_ = cache_path; + } else { + incre_build_cache_path_ = ".ge_cache/"; + } + GELOGD("Using incre build cache path: %s.", incre_build_cache_path_.c_str()); + } else if (iter->second == disable_incre_build) { + is_incre_build_ = false; + GELOGI("Disable incre build."); + } else { + is_incre_build_ = false; + GELOGW("Invalid ENABLE_INCRE_BUILD option, it should be true or false."); + } + } +} + Status GELib::SystemInitialize(const map &options) { Status status = FAILED; auto iter = options.find(OPTION_GRAPH_RUN_MODE); @@ -174,6 +203,8 @@ Status GELib::SystemInitialize(const map &options) { PropertiesManager::Instance().SetDumpStep(dump_step); } } + // check incre build flag + SetIncreBuild(options); if (is_train_mode_) { InitOptions(options); @@ -258,8 +289,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt GE_LOGE_IF(ret != SUCCESS, "write job state failed, ret:%u", ret); options.physical_device_id = options.device_id; - // The physical ID is transferred to the logical ID. FMK receives physical ID - // and needs to be converted + // The physical ID is transferred to the logical ID. FMK receives physical ID and needs to be converted uint32_t dev_logic_index = 0; rtError_t rt_ret = rtGetDeviceIndexByPhyId(static_cast(options.device_id), &dev_logic_index); GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, @@ -273,8 +303,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithOpt GE_CHK_RT_RET(rtSetDevice(options.device_id)); - // In the scenario that the automatic add fusion is set, but there is no - // cleanaddr operator, maybe need to check it + // In the scenario that the automatic add fusion is set, but there is no cleanaddr operator, + // maybe need to check it is_system_inited = true; is_shutdown = false; @@ -287,10 +317,10 @@ Status GELib::SystemShutdownWithOptions(const Options &options) { GELOGI("Training finalize GELib begin."); std::lock_guard lock(status_mutex_); - GE_IF_BOOL_EXEC(is_shutdown || !is_system_inited, GELOGW("System Shutdown with options is already is_shutdown " - "or system does not inited. " - "is_shutdown:%d is_omm_inited:%d", - is_shutdown, is_system_inited); + GE_IF_BOOL_EXEC(is_shutdown || !is_system_inited, + GELOGW("System Shutdown with options is already is_shutdown or system does not inited. 
" + "is_shutdown:%d is_omm_inited:%d", + is_shutdown, is_system_inited); return SUCCESS); GE_CHK_RT(rtDeviceReset(options.device_id)); @@ -324,9 +354,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status GELib::InitSystemWithout static bool is_inited = false; if (is_inited) { - GELOGW( - "System init without options is already inited, don't need to init " - "again."); + GELOGW("System init without options is already inited, don't need to init again."); return SUCCESS; } is_inited = true; diff --git a/src/ge/init/gelib.h b/src/ge/init/gelib.h index 0945907a..3db32dd2 100644 --- a/src/ge/init/gelib.h +++ b/src/ge/init/gelib.h @@ -65,6 +65,12 @@ class GELib { // add head stream to model bool HeadStream() const { return head_stream_; } + // get incre build flag + bool IsIncreBuild() const { return is_incre_build_; } + + // get incre build cache path + const std::string &GetIncreBuildCachePath() const { return incre_build_cache_path_; } + Status InitSystemWithoutOptions(); Status InitSystemWithOptions(Options &options); Status SystemShutdownWithOptions(const Options &options); @@ -76,6 +82,7 @@ class GELib { Status SystemInitialize(const map &options); void RollbackInit(); void InitOptions(const map &options); + void SetIncreBuild(const map &options); DNNEngineManager engineManager_; OpsKernelManager opsManager_; @@ -87,8 +94,9 @@ class GELib { bool is_system_inited = false; bool is_shutdown = false; bool is_use_hcom = false; - + bool is_incre_build_ = false; bool head_stream_ = false; + std::string incre_build_cache_path_; }; } // namespace ge diff --git a/src/ge/single_op/single_op_model.cc b/src/ge/single_op/single_op_model.cc index 1cede863..f2d2da88 100644 --- a/src/ge/single_op/single_op_model.cc +++ b/src/ge/single_op/single_op_model.cc @@ -76,8 +76,11 @@ void SingleOpModel::ParseOpModelParams(ModelHelper &model_helper, SingleOpModelP param.base_addr = ret ? static_cast(value) : 0; ret = ge::AttrUtils::GetInt(model, MODEL_ATTR_TASK_GEN_WEIGHT_ADDR, value); param.weight_addr = ret ? static_cast(value) : 0; + ret = ge::AttrUtils::GetInt(model, ATTR_MODEL_CORE_TYPE, value); + param.core_type = ret ? value : 0; - GELOGI("ParseOpModelParams(), memory_size:%lu, weight_size:%lu.", param.memory_size, param.weight_size); + GELOGI("ParseOpModelParams(), memory_size:%lu, weight_size:%lu. 
core_type = %lu", param.memory_size, + param.weight_size, param.core_type); } Status SingleOpModel::InitModelMem(StreamResource &res) { diff --git a/src/ge/single_op/single_op_model.h b/src/ge/single_op/single_op_model.h index c8880b06..c1a63758 100644 --- a/src/ge/single_op/single_op_model.h +++ b/src/ge/single_op/single_op_model.h @@ -39,13 +39,12 @@ struct SingleOpModelParam { uint8_t *weight_base = nullptr; std::map addr_mapping_; + int64_t core_type = 0; }; class SingleOpModel { public: - SingleOpModel(const std::string &model_name, - const void *model_data, - uint32_t model_size); + SingleOpModel(const std::string &model_name, const void *model_data, uint32_t model_size); ~SingleOpModel() = default; Status Init(); diff --git a/src/ge/single_op/task/tbe_task_builder.cc b/src/ge/single_op/task/tbe_task_builder.cc index b8911d0c..c0f6877f 100644 --- a/src/ge/single_op/task/tbe_task_builder.cc +++ b/src/ge/single_op/task/tbe_task_builder.cc @@ -89,16 +89,17 @@ TbeTaskBuilder::TbeTaskBuilder(const std::string &model_name, const OpDescPtr &o const domi::KernelDef &kernel_def) : op_desc_(op_desc), kernel_def_(kernel_def), stub_name_(model_name + "/" + op_desc->GetName() + "_tvmbin") {} -Status TbeTaskBuilder::DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle) const { +Status TbeTaskBuilder::DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle, + const SingleOpModelParam ¶m) const { rtDevBinary_t binary; binary.version = 0; binary.data = kernel_bin.GetBinData(); binary.length = kernel_bin.GetBinDataSize(); - binary.magic = RT_DEV_BINARY_MAGIC_ELF; + binary.magic = param.core_type == 0 ? RT_DEV_BINARY_MAGIC_ELF : RT_DEV_BINARY_MAGIC_ELF_AIVEC; auto ret = rtDevBinaryRegister(&binary, bin_handle); if (ret != RT_ERROR_NONE) { - GELOGE(RT_FAILED, "rtDevBinaryRegister failed, bin key = %s, rt ret = %d", stub_name_.c_str(), - static_cast(ret)); + GELOGE(RT_FAILED, "rtDevBinaryRegister failed, bin key = %s, core_type = %ld, rt ret = %d", stub_name_.c_str(), + param.core_type, static_cast(ret)); return RT_FAILED; } @@ -132,13 +133,13 @@ Status TbeTaskBuilder::DoRegisterFunction(void *bin_handle, const char *stub_nam return SUCCESS; } -Status TbeTaskBuilder::DoRegisterKernel(const ge::OpKernelBin &tbe_kernel, const char *bin_file_key, - void **bin_handle) { +Status TbeTaskBuilder::DoRegisterKernel(const ge::OpKernelBin &tbe_kernel, const char *bin_file_key, void **bin_handle, + const SingleOpModelParam ¶m) { std::string kernel_name; GetKernelName(op_desc_, kernel_name); void *handle = nullptr; - auto ret = DoRegisterBinary(tbe_kernel, &handle); + auto ret = DoRegisterBinary(tbe_kernel, &handle, param); if (ret != SUCCESS) { return ret; } @@ -160,7 +161,7 @@ Status TbeTaskBuilder::DoRegisterKernel(const ge::OpKernelBin &tbe_kernel, const return SUCCESS; } -Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task) { +Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task, const SingleOpModelParam ¶m) { KernelBinRegistry ®istry = KernelBinRegistry::GetInstance(); // check if already registered const char *stub_func = registry.GetStubFunc(stub_name_); @@ -190,7 +191,7 @@ Status TbeTaskBuilder::RegisterKernel(TbeOpTask &task) { } void *bin_handle = nullptr; - auto ret = DoRegisterKernel(*tbe_kernel, stub_func, &bin_handle); + auto ret = DoRegisterKernel(*tbe_kernel, stub_func, &bin_handle, param); if (ret == SUCCESS) { holder->SetBinHandle(bin_handle); if (!registry.AddKernel(stub_name_, holder)) { @@ -285,7 +286,7 @@ Status TbeTaskBuilder::BuildTask(TbeOpTask &task, const 
SingleOpModelParam ¶ return ret; } - ret = RegisterKernel(task); + ret = RegisterKernel(task, param); if (ret != SUCCESS) { return ret; } diff --git a/src/ge/single_op/task/tbe_task_builder.h b/src/ge/single_op/task/tbe_task_builder.h index 25441289..5e0965bf 100644 --- a/src/ge/single_op/task/tbe_task_builder.h +++ b/src/ge/single_op/task/tbe_task_builder.h @@ -74,9 +74,10 @@ class TbeTaskBuilder { Status SetKernelArgs(TbeOpTask &task, const SingleOpModelParam ¶m); Status GetSmDesc(void **sm_desc, const SingleOpModelParam ¶m) const; - Status RegisterKernel(TbeOpTask &task); - Status DoRegisterKernel(const OpKernelBin &kernel_bin, const char *bin_file_key, void **bin_handle); - Status DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle) const; + Status RegisterKernel(TbeOpTask &task, const SingleOpModelParam ¶m); + Status DoRegisterKernel(const OpKernelBin &kernel_bin, const char *bin_file_key, void **bin_handle, + const SingleOpModelParam ¶m); + Status DoRegisterBinary(const OpKernelBin &kernel_bin, void **bin_handle, const SingleOpModelParam ¶m) const; Status DoRegisterMeta(void *bin_handle); static Status DoRegisterFunction(void *bin_handle, const char *stub_name, const char *kernel_name); diff --git a/src/proto/fusion_model.proto b/src/proto/fusion_model.proto index 2ff6b77a..6220963c 100644 --- a/src/proto/fusion_model.proto +++ b/src/proto/fusion_model.proto @@ -17,9 +17,10 @@ syntax = "proto3"; import "om.proto"; + package domi; message FusionModelDef { string version = 1; repeated OpDef fusion_op = 2; -} +} \ No newline at end of file diff --git a/src/proto/task.proto b/src/proto/task.proto index 3eb8de5c..8ef5c2e2 100644 --- a/src/proto/task.proto +++ b/src/proto/task.proto @@ -31,7 +31,7 @@ message ModelTaskDef { repeated bytes op = 15; // input/output opdef in bytes - uint64 base_addr = 16; // base addr + uint64 base_addr = 16; // base addr uint64 weight_addr = 17; // weight addr uint32 batch_num = 18; } @@ -58,6 +58,10 @@ message TaskDef { bytes private_def = 34; uint64 ops_kernel_store_ptr = 35; // adjustments to other fields in the future StreamSwitchNDef stream_switch_n = 36; + + LabelSetDef label_set = 37; + LabelGotoExDef label_goto_ex = 38; + LabelSwitchByIndexDef label_switch_by_index = 39; } message KernelDef { @@ -119,6 +123,7 @@ message MemcpyAsyncDef { uint64 src = 3; uint64 count = 4; uint32 kind = 5; + uint32 op_index = 6; } message StreamSwitchDef { @@ -142,3 +147,20 @@ message StreamSwitchNDef { uint32 element_size = 5; uint32 data_type = 6; } + +message LabelSetDef { + uint32 op_index = 1; + uint32 label_id = 2; + uint32 model_id = 3; +} + +message LabelGotoExDef { + uint32 op_index = 1; + uint32 label_id = 2; + uint32 model_id = 3; +} + +message LabelSwitchByIndexDef { + uint32 op_index = 1; + uint32 label_max = 2; +} diff --git a/tests/depends/omg/src/omg_stub.cc b/tests/depends/omg/src/omg_stub.cc index 7197dac7..224d4128 100644 --- a/tests/depends/omg/src/omg_stub.cc +++ b/tests/depends/omg/src/omg_stub.cc @@ -122,6 +122,7 @@ struct OmFileContext { class SubGraphInfo; using SubGraphInfoPtr = std::shared_ptr; +using Graph2SubGraphInfoList = std::unordered_map>; using GeModelPartitionPtr = std::shared_ptr; using ModelPtr = std::shared_ptr; @@ -220,7 +221,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void OmFileSaveHelper::AddParti } class ModelBuilder { public: - ModelBuilder(ge::ComputeGraphPtr compute_graph, const std::vector &subgraphs, + ModelBuilder(ge::ComputeGraphPtr compute_graph, const Graph2SubGraphInfoList &subgraphs, const 
std::map &stream_max_parallel_num, bool hcom_parallel, int mode); virtual ~ModelBuilder(); Status BuildModel(ge::Model &model_def); @@ -235,7 +236,7 @@ class ModelBuilder { ge::Buffer weight_buffer_; }; -ModelBuilder::ModelBuilder(ge::ComputeGraphPtr compute_graph, const std::vector &subgraphs, +ModelBuilder::ModelBuilder(ge::ComputeGraphPtr compute_graph, const Graph2SubGraphInfoList &subgraphs, const std::map &stream_max_parallel_num, bool hcom_parallel, int mode) { weight_buffer_ = ge::Buffer(4100000); } diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt index 5ed130c7..fa94bab1 100755 --- a/tests/ut/ge/CMakeLists.txt +++ b/tests/ut/ge/CMakeLists.txt @@ -44,6 +44,7 @@ include_directories(${GE_SOURCE_DIR}/inc/graph) include_directories(${GE_SOURCE_DIR}/inc/framework) include_directories(${GE_SOURCE_DIR}/inc/common) include_directories(${GE_SOURCE_DIR}/third_party/securec/include) +include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/cce) include_directories(${GE_SOURCE_DIR}/third_party/fwkacllib/inc/ops) diff --git a/tests/ut/ge/common/datatype_transfer_unittest.cc b/tests/ut/ge/common/datatype_transfer_unittest.cc index 5f11b272..e0f258a9 100644 --- a/tests/ut/ge/common/datatype_transfer_unittest.cc +++ b/tests/ut/ge/common/datatype_transfer_unittest.cc @@ -368,14 +368,20 @@ TEST_F(UtestDataTypeTransfer, invalid_src_data_type) { EXPECT_EQ(transfer.TransDataType(args, result), UNSUPPORTED); } -TEST_F(UtestDataTypeTransfer, src_shape_empry) { - uint8_t data[1 * 4 * 4 * 1] = {0}; +TEST_F(UtestDataTypeTransfer, src_shape_empty) { + uint8_t data[1*4*4*1] = {0}; + constexpr int64_t kShapeItemNumMAX = 1024UL * 1024UL * 1024UL * 1024UL; DataTypeTransfer transfer; - CastArgs args{reinterpret_cast(data), 0, DT_UINT8, DT_INT32}; + CastArgs args { + reinterpret_cast(data), + 0, + DT_UINT8, + DT_INT32 + }; TransResult result; - EXPECT_EQ(transfer.TransDataType(args, result), PARAM_INVALID); + EXPECT_EQ(transfer.TransDataType(args, result), SUCCESS); } TEST_F(UtestDataTypeTransfer, unsupprot_trans) { diff --git a/tests/ut/ge/common/format_transfer_nhwc_5d_unittest.cc b/tests/ut/ge/common/format_transfer_nhwc_5d_unittest.cc index 8d1ff256..b4beb6ce 100644 --- a/tests/ut/ge/common/format_transfer_nhwc_5d_unittest.cc +++ b/tests/ut/ge/common/format_transfer_nhwc_5d_unittest.cc @@ -701,7 +701,7 @@ TEST_F(UtestFormatTransferNhwc5d, invalid_src_shape2) { EXPECT_EQ(transfer.TransFormat(args, result), PARAM_INVALID); Status status = transfer.TransShape(args.src_format, args.src_shape, args.src_data_type, args.dst_format, args.dst_shape); - EXPECT_EQ(status, PARAM_INVALID); + EXPECT_EQ(status, SUCCESS); } TEST_F(UtestFormatTransferNhwc5d, invalid_src_format) { diff --git a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc index f9799b49..e49005e8 100644 --- a/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc +++ b/tests/ut/ge/graph/build/logical_stream_allocator_unittest.cc @@ -21,6 +21,7 @@ #define protected public #define private public #include "graph/manager/graph_manager_utils.h" +#include "common/op/attr_value_util.h" #undef protected #undef private @@ -189,18 +190,20 @@ class UtestLogicalStreamAllocator : public testing::Test { bool ExpectStreamEq(SubGraphInfoPtr subgraph, int64_t expect) { return GetStream(subgraph) == expect; } bool ExpectStreamNe(SubGraphInfoPtr 
subgraph, int64_t expect) { return GetStream(subgraph) != expect; } - Status AssignLogicalStreams(vector subgraphs, vector &confs, + Status AssignLogicalStreams(Graph2SubGraphInfoList &subgraph_map, vector &confs, std::map &max_parallel_num, ComputeGraphPtr &whole_graph) { SchedulerConf scheduler_conf; if (confs.empty()) { - for (const auto &subgraph : subgraphs) { - EngineConfPtr conf = make_shared(); - conf->id = subgraph->GetEngineName(); - if (conf->id == "ge_local") { - conf->skip_assign_stream = true; - conf->attach = true; + for (const auto &subgraph_pair : subgraph_map) { + for (const auto &subgraph : subgraph_pair.second) { + EngineConfPtr conf = make_shared(); + conf->id = subgraph->GetEngineName(); + if (conf->id == "ge_local") { + conf->skip_assign_stream = true; + conf->attach = true; + } + scheduler_conf.cal_engines[conf->id] = conf; } - scheduler_conf.cal_engines[conf->id] = conf; } } else { for (auto &conf : confs) { @@ -217,24 +220,33 @@ class UtestLogicalStreamAllocator : public testing::Test { scheduler_confs["scheduler"] = scheduler_conf; LogicalStreamAllocator allocator(scheduler_confs, max_parallel_num); int64_t stream_num = 0; - return allocator.Assign(whole_graph, subgraphs, stream_num); + return allocator.Assign(whole_graph, subgraph_map, stream_num); } - Status AssignLogicalStreams(vector subgraphs, std::map &max_parallel_num, - vector &confs) { - ComputeGraphPtr whole_graph = make_shared("whole_graph"); + Status AssignLogicalStreams(vector subgraphs, + vector &confs, + std::map &max_parallel_num, + ComputeGraphPtr &whole_graph) { + Graph2SubGraphInfoList subgraph_map; + subgraph_map[whole_graph] = subgraphs; + return AssignLogicalStreams(subgraph_map, confs, max_parallel_num, whole_graph); + } + + Status AssignLogicalStreams(vector subgraphs, vector& confs, + std::map& max_parallel_num) { + ComputeGraphPtr whole_graph = make_shared < ComputeGraph > ("whole_graph"); return AssignLogicalStreams(subgraphs, confs, max_parallel_num, whole_graph); } Status AssignLogicalStreams(vector subgraphs, vector confs = vector()) { std::map max_parallel_num; - return AssignLogicalStreams(subgraphs, max_parallel_num, confs); + return AssignLogicalStreams(subgraphs, confs, max_parallel_num); } - Status AssignLogicalStreams(vector subgraphs, std::map &max_parallel_num) { - vector confs; - return AssignLogicalStreams(subgraphs, max_parallel_num, confs); + Status AssignLogicalStreams(vector subgraphs, std::map& max_parallel_num) { + vector < EngineConfPtr > confs; + return AssignLogicalStreams(subgraphs, confs, max_parallel_num); } /// typical case @@ -294,8 +306,8 @@ class UtestLogicalStreamAllocator : public testing::Test { max_parallel_num["aicpu"] = parallel_num; Status status = AssignLogicalStreams({const1, const2, get_next, genmask1, genmask2, domask, subgraph4, subgraph5, - subgraph6, allreduce1, allreduce2, apply1, apply2}, - max_parallel_num, confs); + subgraph6, allreduce1, allreduce2, apply1, apply2}, confs, + max_parallel_num); EXPECT_EQ(status, ge::SUCCESS); EXPECT_EQ(GetStream(get_next), 0); @@ -324,7 +336,7 @@ class UtestLogicalStreamAllocator : public testing::Test { /// E --> F(AllReduce) --- G /// stream id: 2 2 2 /// - void make_graph_with_allreduce(ge::ComputeGraphPtr graph) { + void MakeGraphWithAllreduce(ge::ComputeGraphPtr graph) { ge::OpDescPtr op_a = make_shared("A", DATA); auto desc_temp_ptr = make_shared(); auto desc_temp = *desc_temp_ptr; @@ -337,6 +349,7 @@ class UtestLogicalStreamAllocator : public testing::Test { ge::OpDescPtr op_c = make_shared("C", 
"HcomAllReduce"); op_c->AddInputDesc(desc_temp); + op_c->AddInputDesc(desc_temp); op_c->AddOutputDesc(desc_temp); ge::OpDescPtr op_d = make_shared("D", "testa"); @@ -349,12 +362,21 @@ class UtestLogicalStreamAllocator : public testing::Test { ge::OpDescPtr op_f = make_shared("F", "HcomAllReduce"); op_f->AddInputDesc(desc_temp); + op_f->AddInputDesc(desc_temp); op_f->AddOutputDesc(desc_temp); ge::OpDescPtr op_g = make_shared("G", "testa"); op_g->AddInputDesc(desc_temp); op_g->AddOutputDesc(desc_temp); + ge::OpDescPtr op_h = make_shared("H", "testa"); + op_h->AddInputDesc(desc_temp); + op_h->AddOutputDesc(desc_temp); + + ge::OpDescPtr op_i = make_shared("I", "testa"); + op_i->AddInputDesc(desc_temp); + op_i->AddOutputDesc(desc_temp); + // add node ge::NodePtr node_a = graph->AddNode(op_a); ge::NodePtr node_b = graph->AddNode(op_b); @@ -363,14 +385,18 @@ class UtestLogicalStreamAllocator : public testing::Test { ge::NodePtr node_e = graph->AddNode(op_e); ge::NodePtr node_f = graph->AddNode(op_f); ge::NodePtr node_g = graph->AddNode(op_g); + ge::NodePtr node_h = graph->AddNode(op_h); + ge::NodePtr node_i = graph->AddNode(op_i); // add edge - ge::GraphUtils::AddEdge(node_a->GetOutDataAnchor(0), node_b->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_a->GetOutDataAnchor(0), node_e->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_b->GetOutDataAnchor(0), node_c->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_c->GetOutDataAnchor(0), node_d->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_e->GetOutDataAnchor(0), node_f->GetInDataAnchor(0)); - ge::GraphUtils::AddEdge(node_f->GetOutDataAnchor(0), node_g->GetInDataAnchor(0)); + node_a->GetOutDataAnchor(0)->LinkTo(node_b->GetInDataAnchor(0)); + node_a->GetOutDataAnchor(0)->LinkTo(node_e->GetInDataAnchor(0)); + node_b->GetOutDataAnchor(0)->LinkTo(node_c->GetInDataAnchor(0)); + node_c->GetOutDataAnchor(0)->LinkTo(node_d->GetInDataAnchor(0)); + node_e->GetOutDataAnchor(0)->LinkTo(node_f->GetInDataAnchor(0)); + node_f->GetOutDataAnchor(0)->LinkTo(node_g->GetInDataAnchor(0)); + node_h->GetOutDataAnchor(0)->LinkTo(node_c->GetInDataAnchor(1)); + node_i->GetOutDataAnchor(0)->LinkTo(node_f->GetInDataAnchor(1)); // add stream id node_a->GetOpDesc()->SetStreamId(0); @@ -380,6 +406,14 @@ class UtestLogicalStreamAllocator : public testing::Test { node_e->GetOpDesc()->SetStreamId(2); node_f->GetOpDesc()->SetStreamId(2); node_g->GetOpDesc()->SetStreamId(2); + + // add stream label + string stream_label1 = "1"; + (void) AttrUtils::SetStr(node_c->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label1); + (void) AttrUtils::SetStr(node_d->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label1); + string stream_label2 = "2"; + (void) AttrUtils::SetStr(node_f->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label2); + (void) AttrUtils::SetStr(node_g->GetOpDesc(), ATTR_NAME_STREAM_LABEL, stream_label2); } }; @@ -652,7 +686,7 @@ TEST_F(UtestLogicalStreamAllocator, test_independent) { vector confs = {conf1, conf2}; Status status = - AssignLogicalStreams({subgraph1, subgraph2, subgraph3, subgraph4, subgraph5}, max_parallel_num, confs); + AssignLogicalStreams({subgraph1, subgraph2, subgraph3, subgraph4, subgraph5}, confs, max_parallel_num); EXPECT_EQ(status, ge::SUCCESS); EXPECT_EQ(GetStream(subgraph1), 0); EXPECT_EQ(GetStream(subgraph2), 0); @@ -695,7 +729,7 @@ TEST_F(UtestLogicalStreamAllocator, test_independent_switch_label) { vector confs = {conf1, conf2, conf3}; Status status = - AssignLogicalStreams({subgraph1, subgraph2, subgraph3, subgraph4, subgraph5}, 
max_parallel_num, confs); + AssignLogicalStreams({subgraph1, subgraph2, subgraph3, subgraph4, subgraph5}, confs, max_parallel_num); EXPECT_EQ(status, ge::SUCCESS); EXPECT_EQ(GetStream(subgraph1), 4); EXPECT_EQ(GetStream(subgraph2), 0); @@ -833,9 +867,9 @@ TEST_F(UtestLogicalStreamAllocator, test_reassign_stream) { auto node1_1 = whole_graph->AddNode(node1->GetOpDesc()); auto node1_2 = whole_graph->AddNode(node2->GetOpDesc()); auto node1_3 = whole_graph->AddNode(node3->GetOpDesc()); - GraphUtils::AddEdge(node1_1->GetOutControlAnchor(), node1_2->GetInControlAnchor()); - GraphUtils::AddEdge(node1_2->GetOutDataAnchor(0), node1_3->GetInDataAnchor(0)); - GraphUtils::AddEdge(node1->GetOutControlAnchor(), node2->GetInControlAnchor()); + node1_1->GetOutControlAnchor()->LinkTo(node1_2->GetInControlAnchor()); + node1_2->GetOutDataAnchor(0)->LinkTo(node1_3->GetInDataAnchor(0)); + node1->GetOutControlAnchor()->LinkTo(node2->GetInControlAnchor()); std::map max_parallel_num; vector subgraphs = {subgraph1, const2, subgraph3}; @@ -853,7 +887,7 @@ TEST_F(UtestLogicalStreamAllocator, test_all_reduce_parallel_pass) { ge::ComputeGraphPtr graph = make_shared(""); graph->SetName("TestAllReduceParallelPass"); - make_graph_with_allreduce(graph); + MakeGraphWithAllreduce(graph); std::map max_parallel_num; LogicalStreamPass::Context context; @@ -863,7 +897,13 @@ TEST_F(UtestLogicalStreamAllocator, test_all_reduce_parallel_pass) { LogicalStreamPassPtr allreduce_pass = std::make_shared(); ret = allreduce_pass->Run(graph, subgraphs, context); - EXPECT_EQ(ret, NOT_CHANGED); + EXPECT_EQ(ret, SUCCESS); + + ge::NodePtr node_d = graph->FindNode("D"); + ge::NodePtr node_g = graph->FindNode("G"); + int64_t stream_d = node_d->GetOpDesc()->GetStreamId(); + int64_t stream_g = node_g->GetOpDesc()->GetStreamId(); + EXPECT_EQ(stream_d + stream_g, 11); } } // namespace ge diff --git a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc index f8deff7f..a51299b3 100644 --- a/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc +++ b/tests/ut/ge/graph/load/new_model_manager_davinci_model_unittest.cc @@ -315,7 +315,7 @@ TEST_F(UtestModelManagerDavinciModel, success_GetInputOutputDescInfo_without_net auto node = compute_graph->AddNode(op_desc); model.data_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; model.op_list_[0] = op_desc; @@ -419,7 +419,7 @@ TEST_F(UtestModelManagerDavinciModel, success_get_input_output_descInfo_with_net model.op_list_[0] = op_desc; model.output_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; vector input_shapes; vector output_shapes; @@ -463,7 +463,7 @@ TEST_F(UtestModelManagerDavinciModel, success_get_input_output_desc_info_for_zer model.op_list_[0] = op_desc; model.output_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; model.output_memory_size_list_.push_back(64); vector input_shapes; @@ -508,7 +508,7 @@ TEST_F(UtestModelManagerDavinciModel, success_get_input_output_desc_info_dim_siz model.op_list_[0] = op_desc; model.output_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; vector input_shapes; vector output_shapes; @@ -1282,7 +1282,7 @@ TEST_F(UtestModelManagerDavinciModel, 
success_get_output_desc_info_with_netoutpu model.op_list_[0] = op_desc; model.output_op_list_.push_back(op_desc); - model.output_size_list_.push_back(32); + model.output_data_info_[0] = {32, (void *)0x70002010}; model.output_memory_size_list_.push_back(64); vector output_shapes; diff --git a/tests/ut/ge/graph/load/output_net_output_unittest.cc b/tests/ut/ge/graph/load/output_net_output_unittest.cc index 52fdebfa..ca0eb871 100644 --- a/tests/ut/ge/graph/load/output_net_output_unittest.cc +++ b/tests/ut/ge/graph/load/output_net_output_unittest.cc @@ -131,25 +131,6 @@ TEST_F(UtestNetOutput, true_is_output) { delete model_utils; } -// test ModelUtils::IsInputTensorNeedTrans -TEST_F(UtestNetOutput, success_is_output_tensor_need_trans) { - ModelUtils *model_utils = new ModelUtils(); - std::shared_ptr op_desc = std::make_shared(); - OmeTestOpDescBuilder builder(op_desc); - builder.SetType("NetOutput"); - size_t tensor_index = 1; - vector outputs_desc; - std::shared_ptr desc = std::make_shared(); - outputs_desc.push_back(desc); - op_desc->outputs_desc_ = outputs_desc; - op_desc->inputs_desc_ = outputs_desc; - - bool ret = model_utils->IsInputTensorNeedTrans(op_desc, tensor_index); - EXPECT_EQ(false, ret); - - delete model_utils; -} - // test ModelUtils::GetOutputSize TEST_F(UtestNetOutput, success_get_output_size) { vector v_output_size; diff --git a/third_party/fwkacllib/inc/mmpa/mmpa_api.h b/third_party/fwkacllib/inc/mmpa/mmpa_api.h index ce1c9720..f1e30538 100644 --- a/third_party/fwkacllib/inc/mmpa/mmpa_api.h +++ b/third_party/fwkacllib/inc/mmpa/mmpa_api.h @@ -20,7 +20,7 @@ #define LINUX 0 #define WIN 1 -#if(OS_TYPE == LINUX) //lint !e553 +#if(OS_TYPE == LINUX) #ifndef _GNU_SOURCE #define _GNU_SOURCE @@ -84,7 +84,7 @@ #endif -#if(OS_TYPE == WIN) //lint !e553 +#if(OS_TYPE == WIN) #include #include #include "Windows.h" diff --git a/third_party/fwkacllib/inc/ops/all_ops.h b/third_party/fwkacllib/inc/ops/all_ops.h index 36c991ff..37315c74 100644 --- a/third_party/fwkacllib/inc/ops/all_ops.h +++ b/third_party/fwkacllib/inc/ops/all_ops.h @@ -35,7 +35,6 @@ #include "decode_wheels_target.h" #include "elewise_calculation_ops.h" #include "fastrcnn_predictions.h" -#include "fsrdetectionoutput_ops.h" #include "functional_ops.h" #include "get_data_ops.h" #include "hcom_ops.h" @@ -58,7 +57,6 @@ #include "outfeed_ops.h" #include "pad_ops.h" #include "parsing_ops.h" -#include "power_ops.h" #include "quantize_ops.h" #include "ragged_conversion_ops.h" #include "random_ops.h" diff --git a/third_party/fwkacllib/inc/ops/array_ops.h b/third_party/fwkacllib/inc/ops/array_ops.h index 74f8924a..7febad77 100644 --- a/third_party/fwkacllib/inc/ops/array_ops.h +++ b/third_party/fwkacllib/inc/ops/array_ops.h @@ -595,6 +595,9 @@ REG_OP(ExpandDims) *@par Outputs: *y: A tensor. + +*@par Attention: +*This operator cannot be directly called by the acllopExecute API. */ REG_OP(Reshape) .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, @@ -848,7 +851,7 @@ REG_OP(Copy) `farmhash::fingerprint64`. *@par Outputs: -y: A two-dimensional `Tensor` of type `tf.uint8`. The first dimension equals to \n +y: A two-dimensional `Tensor` of type `uint8`. The first dimension equals to \n `data`'s first dimension, and the second dimension size depends on the \n fingerprint algorithm. 
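The fingerprint contract documented just above pins down the output shape: with method farmhash::fingerprint64 each row of "data" hashes to one 64-bit value, so the uint8 output is [data.dim(0), 8]. A small sketch of that shape rule (illustration only, not GE code; the method name string is an assumption):

// Output shape rule for the fingerprint operator described above:
// 2-D uint8, first dim = input's first dim, second dim = bytes per
// fingerprint (8 for a 64-bit farmhash fingerprint).
#include <array>
#include <cstdint>
#include <string>

std::array<int64_t, 2> FingerprintOutputShape(int64_t batch, const std::string &method) {
  const int64_t bytes_per_row = (method == "farmhash64") ? 8 : -1;  // -1 marks unsupported methods
  return {batch, bytes_per_row};
}
// e.g. FingerprintOutputShape(32, "farmhash64") -> {32, 8}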
diff --git a/third_party/fwkacllib/inc/ops/data_flow_ops.h b/third_party/fwkacllib/inc/ops/data_flow_ops.h index 0eaee06c..fee5e67d 100644 --- a/third_party/fwkacllib/inc/ops/data_flow_ops.h +++ b/third_party/fwkacllib/inc/ops/data_flow_ops.h @@ -259,7 +259,7 @@ match this name to the matching Unstage Op. REG_OP(Stage) .DYNAMIC_INPUT(values, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, \ DT_INT16, DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .ATTR(capacity, Int, 0) .ATTR(memory_limit, Int, 0) .ATTR(container, String, "") @@ -312,7 +312,7 @@ REG_OP(StagePeek) .INPUT(index, TensorType({DT_INT32})) .DYNAMIC_OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .ATTR(capacity, Int, 0) .ATTR(memory_limit, Int, 0) .ATTR(container, String, "") @@ -363,7 +363,7 @@ REG_OP(StackPop) .INPUT(handle, TensorType({DT_RESOURCE})) .OUTPUT(element, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .REQUIRED_ATTR(elem_type, Type) .OP_END_FACTORY_REG(StackPop) @@ -388,10 +388,10 @@ REG_OP(StackPush) .INPUT(handle, TensorType({DT_RESOURCE})) .INPUT(element, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .ATTR(swap_memory, Bool, false) .OP_END_FACTORY_REG(StackPush) @@ -540,7 +540,7 @@ REG_OP(ParallelDynamicStitch) *@par Attributes:An optional int that is >= 0. Defaults to "0". *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes. +*@li dtypes: A list of DTypes. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -563,7 +563,7 @@ REG_OP(MapClear) *@par Attributes: *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes. +*@li dtypes: A list of DTypes. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -602,7 +602,7 @@ REG_OP(MapIncompleteSize) REG_OP(Unstage) .DYNAMIC_OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT16, \ DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, \ - DT_DOUBLE})) + DT_DOUBLE, DT_UINT32, DT_UINT64})) .ATTR(capacity, Int, 0) .ATTR(memory_limit, Int, 0) .ATTR(container, String, "") @@ -630,7 +630,7 @@ DT_QINT8, DT_QUINT8, DT_QINT16, DT_QUINT16, DT_QINT32. Maximum number of elements in the Staging Area. If > 0, \n inserts on the container will block when the capacity is reached. *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes. +*@li dtypes: A list of DTypes. *@li container: An optional string. Defaults to "". \n If non-empty, this queue is placed in the given container. \n Otherwise, a default container is used. @@ -752,7 +752,7 @@ REG_OP(MapUnstageNoKey) *@par Attributes: *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes that has length >= 1. 
+*@li dtypes: A list of DTypes that has length >= 1. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -789,7 +789,7 @@ REG_OP(MapPeek) *@par Attributes: *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes. +*@li dtypes: A list of DTypes. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -1183,7 +1183,7 @@ REG_OP(PaddingFIFOQueue) *@brief A queue that produces elements sorted by the first component value. *@par Attributes: -*@li component_types: An optional list of tf.DTypes. Defaults to {}. \n +*@li component_types: An optional list of DTypes. Defaults to {}. \n The type of each component in a value. *@li shapes: A list of shapes for each component of a queue element. The length of this attr must be either 0 or the same as the length of \n @@ -1451,7 +1451,7 @@ REG_OP(OrderedMapUnstageNoKey) *@par Attributes: *@li capacity: An optional int that is >= 0. Defaults to "0". *@li memory_limit: An optional int that is >= 0. Defaults to "0". -*@li dtypes: A list of tf.DTypes that has length >= 1. +*@li dtypes: A list of DTypes that has length >= 1. *@li container: An optional string. Defaults to "". *@li shared_name: An optional string. Defaults to "". @@ -1876,7 +1876,7 @@ REG_OP(SparseAccumulatorApplyGradient) .INPUT(local_step, TensorType({DT_INT64})) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \ - DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT,DT_FLOAT16, DT_UINT32, \ + DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_UINT32, \ DT_UINT64, DT_COMPLEX64, DT_COMPLEX128, DT_QINT16, DT_QUINT16, \ DT_QINT8, DT_QUINT8, DT_QINT32})) .INPUT(shape, TensorType({DT_INT64}))
diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h index d5272805..3eff2cbe 100644 --- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h @@ -886,7 +886,10 @@ REG_OP(BesselI1e) * y: A Tensor of type UnaryDataType. * @attention Constraints: -* @li base > 0 or if base is set to default (-1), base is set to e; +* @li "base" is supposed to be greater than 0. Retaining the default \n +* value "-1" sets "base" to "e". +* @li If the input value of operator Log is within the range (0, 0.01] or \n +* [0.95, 1.05], the output accuracy is subject to change. */ REG_OP(Log) .INPUT(x, TensorType::UnaryDataType()) @@ -2056,6 +2059,7 @@ REG_OP(ArgMinWithValue) * "0": product, "1": sum, "2": max. *@li coeff: A required attribute. Must met all of following rules: * size of "coeff" must be equal to len("x") or is null. +* the absolute value of "coeff" must be less than or equal to 1. */ REG_OP(Eltwise) .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
diff --git a/third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h b/third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h deleted file mode 100644 index 2b3e206d..00000000 --- a/third_party/fwkacllib/inc/ops/fsrdetectionoutput_ops.h +++ /dev/null @@ -1,67 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef GE_OP_FSRDETECTIONOUTPUT_OPS_H_ -#define GE_OP_FSRDETECTIONOUTPUT_OPS_H_ -#include "graph/operator_reg.h" - -namespace ge { -/** -*@brief Returns detection result. - -*@par Inputs: -* Four inputs, including: -*@li rois: An NCHW tensor of type floa16 or float32, output from operator proposal_d at the preceding layer, used as the input of operator FSRDetectionOutput. -*@li prior_box: An NCHWC0 tensor of type floa16 or float32, specifying the prediction offset, used to update the coordinates [x1, y1, x2, y2] of each ROI. -*@li score: An NCHWC0 tensor of type floa16 or float32, specifying the probability of each class. Class 0 is the background class. -*@li actual_rois_num: An NCHW tensor of type int32, specifying the number of valid boxes per batch. -*@par Attributes: -*@li batch_rois: An optional int32, specifying the number of images to be predicted. Defaults to "1024". The value range is [1, 1024]. -*@li im_info: An optional list of two ints. Defaults to (375, 1024). The value range is [1, 1024]. -*@li num_classes: An optional int32, specifying the number of classes to be predicted. Defaults to "80". The value must be greater than 0. -*@li max_rois_num: An optional int32, specifying the maximum number of ROIs per batch. Defaults to "1024". The value must be a multiple of 16. -*@li score_thresh: An optional float32, specifying the threshold for box filtering. Defaults to 0.45. The value range is [0.0, 1.0]. -*@li nms_thresh: An optional float32, specifying the confidence threshold for box filtering, which is the output "obj" of operator Region. Defaults to 0.7. The value range is (0.0, 1.0). -*@li bbox_reg_weights: An optional list of four ints. Defaults to (1, 1, 1, 1). Must not have value "0". -*@li post_nms_topn: An optional int, specifying the number of output boxes. Defaults to "304". The value must be less than or equal to 1024 and must be a multiple of 16. -*@li kernel_name: An optional string, specifying the operator name. Defaults to "fsr_detection_output". -*@par Outputs: -*box: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. -*actual_bbox_num: An NCHW tensor of type int32, specifying the number of output boxes. - -*@attention Constraints:\n -*@li totalnum < max_rois_num * batch_rois. -*@li "score" must be with shape (total_num, (num_classes+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images. -*@li "prior_box" must be with shape (total_num, (num_classes*4+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images. 
-*/ -REG_OP(FSRDetectionOutput) - .INPUT(rois, TensorType({DT_FLOAT, DT_FLOAT16})) - .INPUT(prior_box, TensorType({DT_FLOAT, DT_FLOAT16})) - .INPUT(score, TensorType({DT_FLOAT, DT_FLOAT16})) - .INPUT(actual_rois_num, TensorType({DT_INT32})) - .OUTPUT(actual_bbox_num, TensorType({DT_INT32})) - .OUTPUT(box, TensorType({DT_FLOAT, DT_FLOAT16})) - .ATTR(batch_rois, Int, 1024) - .ATTR(im_info, ListInt, {375,1024}) - .ATTR(num_classes, Int, 80) - .ATTR(max_rois_num, Int, 1024) - .ATTR(score_thresh, Float, 0.45) - .ATTR(nms_thresh, Float, 0.7) - .ATTR(bbox_reg_weights, ListInt, {1,1,1,1}) - .ATTR(post_nms_topn, Int, 304) - .OP_END_FACTORY_REG(FSRDetectionOutput) -} -#endif
diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h index 2ac7a70e..aaad03c6 100644 --- a/third_party/fwkacllib/inc/ops/image_ops.h +++ b/third_party/fwkacllib/inc/ops/image_ops.h @@ -525,8 +525,7 @@ REG_OP(ResizeBilinearV2) .INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .INPUT(size, TensorType({DT_INT32})) - .OUTPUT(y, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, - DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT})) .ATTR(align_corners, Bool, false) .ATTR(half_pixel_centers, Bool, false) .OP_END_FACTORY_REG(ResizeBilinearV2) @@ -925,7 +924,7 @@ images[3] <= 2048. */ REG_OP(ResizeBilinearV2D) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) - .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) .ATTR(align_corners, Bool, false) .ATTR(half_pixel_centers, Bool, false) .REQUIRED_ATTR(size, ListInt)
diff --git a/third_party/fwkacllib/inc/ops/math_ops.h b/third_party/fwkacllib/inc/ops/math_ops.h index aa318c94..cc97a337 100644 --- a/third_party/fwkacllib/inc/ops/math_ops.h +++ b/third_party/fwkacllib/inc/ops/math_ops.h @@ -23,6 +23,29 @@ namespace ge { /** +*@brief Computes the output as (shift + scale * x) ^ power. + +*@par Inputs: +* x: A Tensor of type float16 or float32. + +*@par Attributes: +*@li power: Optional. Defaults to 1.0. +*@li scale: Optional. Defaults to 1.0. +*@li shift: Optional. Defaults to 0.0. + +*@par Outputs: +* y: A Tensor. Has the same type and shape as "x". +*/ + +REG_OP(Power) + .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT})) + .ATTR(power, Float, 1.0) + .ATTR(scale, Float, 1.0) + .ATTR(shift, Float, 0.0) + .OP_END_FACTORY_REG(Power); + +/** *@brief Compute the lower regularized incomplete Gamma function P(a, x). *@par Inputs:
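The Power operator registered above is fully specified by its brief: y = (shift + scale * x) ^ power, applied elementwise. A host-side reference sketch with the documented attribute defaults (the real kernel runs on the device; this only pins down the math):

// Reference semantics for Power: y = (shift + scale * x) ^ power, elementwise.
#include <cmath>
#include <vector>

std::vector<float> PowerRef(const std::vector<float> &x, float power = 1.0f,
                            float scale = 1.0f, float shift = 0.0f) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    y[i] = std::pow(shift + scale * x[i], power);
  }
  return y;
}
// With the default attributes (power = 1.0, scale = 1.0, shift = 0.0) this is the identity map.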
diff --git a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h index 597a8982..dd2ce56c 100644 --- a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h @@ -492,7 +492,7 @@ REG_OP(DiagPart) *@brief Also known as a "fully-connected" layer, computes an inner product with a set of learned weights, and (optionally) adds biases. *@par Inputs: -* Two inputs, including: +* Four inputs, including: *@li x: A Tensor of type float16, int8. *@li w: A weight matrix of type float16, int8. *@li b: A Tensor of type float16, int32. @@ -501,14 +501,13 @@ REG_OP(DiagPart) *@par Attributes: *@li num_output: Reserved. *@li transpose: A bool, specifying whether to transpose, either "true" or "false". Defaults to "false". -*@li bias_term: A bool, specifying whether to learn and apply a set of additive biases to the filter outputs, either "true" or "false". Defaults to "true". -*@li axis: only support axis is 1. Defaults to "1". -*@li offset_a: A type of Int, Defaults to "1". +*@li axis: Reserved. +*@li offset_x: Reserved. *@par Outputs: *y: The result tensor of type float16, int8. */ -REG_OP(InnerProduct) +REG_OP(FullyConnection) .INPUT(x, TensorType({DT_FLOAT16, DT_INT8})) .INPUT(w, TensorType({DT_FLOAT16, DT_INT8})) .OPTIONAL_INPUT(b, TensorType({DT_FLOAT16, DT_INT32})) @@ -516,10 +515,9 @@ .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32})) .REQUIRED_ATTR(num_output, Int) .ATTR(transpose, Bool, false) - .ATTR(bias_term, Bool, true) .ATTR(axis, Int, 1) - .ATTR(offset_a, Int, 0) - .OP_END_FACTORY_REG(InnerProduct) + .ATTR(offset_x, Int, 0) + .OP_END_FACTORY_REG(FullyConnection) /** *@brief Computes the confusion matrix from predictions and labels.
diff --git a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h index 1be85a0e..bc492e1b 100644 --- a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h @@ -62,7 +62,7 @@ namespace ge { * data is 5D with shape [N, C1, Ho, Wo, C0], * where C is the same as that of the feature map and C0 is 16.\n * Limited by Tiling and L1 / L0 buffer memory: 512 * ceil(Wo, 16) + (480 * -* stride_h + 32 * filter_h) * ceil(Wi, 16) â‰?l1_size and Hf*Wf â‰?l0b_size/512.\n +* stride_h + 32 * filter_h) * ceil(Wi, 16) ≤ l1_size and Hf*Wf ≤ l0b_size/512.\n */ REG_OP(DepthwiseConv2DBackpropFilter) .INPUT(input, TensorType({float16})) @@ -115,7 +115,7 @@ REG_OP(DepthwiseConv2DBackpropFilter) * data is 5D with shape [N, C1, Ho, Wo, C0], * where C is the same as that of the feature map and C0 is 16.\n * Limited by Tiling and L1 / L0 buffer memory: 512 * ceil(Wo, 16) + (480 * -* stride_h + 32 * filter_h) * ceil(Wi, 16) â‰?l1_size and Hf*Wf â‰?l0b_size/512.\n +* stride_h + 32 * filter_h) * ceil(Wi, 16) ≤ l1_size and Hf*Wf ≤ l0b_size/512.\n */ REG_OP(DepthwiseConv2DBackpropFilterD) .INPUT(input, TensorType({float16})) @@ -170,7 +170,7 @@ REG_OP(DepthwiseConv2DBackpropFilterD) * Output backprop is 4D with shape [N, C, Ho, Wo] or [N, Ho, Wo, C], but the * data is 5D with shape [N, C1, Ho, Wo, C0], * where C is the same as that of the feature map and C0 is 16.\n -* Limited by Tiling: max_h_in_l1 â‰?C0, where max_h_in_l1 = (l1_size - Hf * +* Limited by Tiling: max_h_in_l1 ≥ C0, where max_h_in_l1 = (l1_size - Hf * * Wf * C0 * C0 * 2) / (2 * Wo *C0).\n */ REG_OP(DepthwiseConv2DBackpropInput) @@ -223,7 +223,7 @@ REG_OP(DepthwiseConv2DBackpropInput) * Output backprop is 4D with shape [N, C, Ho, Wo] or [N, Ho, Wo, C], but the * data is 5D with shape [N, C1, Ho, Wo, C0], * where C is the same as that of the feature map and C0 is 16.\n -* Limited by Tiling: max_h_in_l1 â‰?C0, where max_h_in_l1 = (l1_size - Hf * +* Limited by Tiling: max_h_in_l1 ≥ C0, where max_h_in_l1 = (l1_size - Hf * * Wf * C0 * C0 * 2) / (2 * Wo *C0).\n */ REG_OP(DepthwiseConv2DBackpropInputD) @@ -439,13 +439,17 @@ REG_OP(Conv2DBackpropInputD) * One optional input: * @li bias: An optional tensor of type int8 *@par Attributes: - * Three attributes: + * Five attributes: * @li strides: A tuple or list of 2 integers. The stride of the sliding window * for H/W dimension. * @li pads: A tuple or list of 4 integers.
The [top, bottom, left, right] * padding on the feature map * @li dilations: A tuple or list of 4 integers. The dilation factor for each * dimension of input. Must be [1, 1, 1, 1]. + * @li groups: Number of blocked connections from input channels to \n output channels. + * @li data_format: An optional string from: "NHWC", "NCHW". Defaults to "NHWC".\n Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as "filter". 4D tensor with shape * [batch, height, width, channels] or [batch, channels, height, width]. @@ -458,6 +462,8 @@ REG_OP(Deconvolution) .ATTR(strides, ListInt, {1, 1, 1, 1}) .ATTR(pads, ListInt, {0, 0, 0, 0}) .ATTR(dilations, ListInt, {1, 1, 1, 1}) + .ATTR(groups, Int, 1) + .ATTR(data_format, String, "NHWC") .OP_END_FACTORY_REG(Deconvolution) /** *@brief Computes the gradients of convolution with respect to the filter @@ -631,7 +637,6 @@ REG_OP(Conv2D) *@par Attributes: *@li strides: A list of 5 ints. Specifies the stride of the sliding window for each dimension of "x". The N and C dimensions must be 1. Has the same format as "x". *@li pads: A list of 6 ints. Supports only padding along the D, H and W dimensions in sequence of head, tail, top, bottom, left and right. -*@li padding_mode: An optional string from: "zeros", "circular". Defaults to "zeros". *@li data_format: An optional string from: "NDHWC", "NCDHW". Defaults to "NDHWC". Specify the data format of the input and output data. *@li dilations: A list of 5 ints. Specifies the dilation factor for each dimension of "x". The N and C dimensions must be 1. Has the same format as "x". @@ -649,7 +654,6 @@ REG_OP(Conv3D) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .ATTR(strides, ListInt, {1, 1, 1, 1, 1}) .ATTR(pads, ListInt, {0, 0, 0, 0, 0, 0}) - .ATTR(padding_mode, String, "zeros") .ATTR(data_format, String, "NDHWC") .ATTR(dilations, ListInt, {1, 1, 1, 1, 1}) .OP_END_FACTORY_REG(Conv3D) @@ -671,7 +675,7 @@ REG_OP(Conv3D) * @li data_format: An optional string from: "NDHWC", "NCHWD". Defaults to "NDHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as filter,and has same format as input_size -*/ +*/ REG_OP(Conv3DBackpropInput) .INPUT(input_sizes, TensorType({DT_INT32, DT_INT64})) .INPUT(filters, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) @@ -698,7 +702,7 @@ REG_OP(Conv3DBackpropInput) * @li data_format: An optional string from: "NDHWC", "NCHWD". Defaults to "NDHWC". Specify the data format of the input and output data. *@par Outputs: * y: A Tensor. Has the same type as filter -*/ +*/ REG_OP(Conv3DBackpropInputD) .INPUT(filters, TensorType({DT_FLOAT16})) .INPUT(grads, TensorType({DT_FLOAT16}))
diff --git a/third_party/fwkacllib/inc/ops/nn_detect_ops.h b/third_party/fwkacllib/inc/ops/nn_detect_ops.h index 04cc3028..f1d6e420 100644 --- a/third_party/fwkacllib/inc/ops/nn_detect_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_detect_ops.h @@ -311,6 +311,357 @@ REG_OP(PSROIPooling) .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16})) .OP_END_FACTORY_REG(PSROIPooling) +/** +*@brief Returns detection result. + +*@par Inputs: +* Four inputs, including: +*@li rois: An NCHW tensor of type float16 or float32, output from operator proposal_d at the preceding layer, used as the input of operator FSRDetectionOutput. +*@li prior_box: An NCHWC0 tensor of type float16 or float32, specifying the prediction offset, used to update the coordinates [x1, y1, x2, y2] of each ROI.
+*@li score: An NCHWC0 tensor of type float16 or float32, specifying the probability of each class. Class 0 is the background class. +*@li actual_rois_num: An NCHW tensor of type int32, specifying the number of valid boxes per batch. +*@par Attributes: +*@li batch_rois: An optional int32, specifying the number of images to be predicted. Defaults to "1024". The value range is [1, 1024]. +*@li im_info: An optional list of two ints. Defaults to (375, 1024). The value range is [1, 1024]. +*@li num_classes: An optional int32, specifying the number of classes to be predicted. Defaults to "80". The value must be greater than 0. +*@li max_rois_num: An optional int32, specifying the maximum number of ROIs per batch. Defaults to "1024". The value must be a multiple of 16. +*@li score_thresh: An optional float32, specifying the threshold for box filtering. Defaults to 0.45. The value range is [0.0, 1.0]. +*@li nms_thresh: An optional float32, specifying the confidence threshold for box filtering, which is the output "obj" of operator Region. Defaults to 0.7. The value range is (0.0, 1.0). +*@li bbox_reg_weights: An optional list of four ints. Defaults to (1, 1, 1, 1). Must not have value "0". +*@li post_nms_topn: An optional int, specifying the number of output boxes. Defaults to "304". The value must be less than or equal to 1024 and must be a multiple of 16. +*@li kernel_name: An optional string, specifying the operator name. Defaults to "fsr_detection_output". +*@par Outputs: +*box: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. +*actual_bbox_num: An NCHW tensor of type int32, specifying the number of output boxes. + +*@attention Constraints:\n +*@li totalnum < max_rois_num * batch_rois. +*@li "score" must be with shape (total_num, (num_classes+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images. +*@li "prior_box" must be with shape (total_num, (num_classes*4+15)//16, 1, 1, 16), where "total_num" indicates the number of valid input boxes of all images. +*/ +REG_OP(FSRDetectionOutput) + .INPUT(rois, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(prior_box, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(score, TensorType({DT_FLOAT, DT_FLOAT16})) + .INPUT(actual_rois_num, TensorType({DT_INT32})) + .OUTPUT(actual_bbox_num, TensorType({DT_INT32})) + .OUTPUT(box, TensorType({DT_FLOAT, DT_FLOAT16})) + .ATTR(batch_rois, Int, 1024) + .ATTR(im_info, ListInt, {375,1024}) + .ATTR(num_classes, Int, 80) + .ATTR(max_rois_num, Int, 1024) + .ATTR(score_thresh, Float, 0.45) + .ATTR(nms_thresh, Float, 0.7) + .ATTR(bbox_reg_weights, ListInt, {1,1,1,1}) + .ATTR(post_nms_topn, Int, 304) + .OP_END_FACTORY_REG(FSRDetectionOutput) + +/** +*@brief Normalizes data. It is called Region on YOLO v2 and Yolo on YOLO v3. + +*@par Inputs: +*x: An NCHW tensor of type float16 or float32. The data is with shape (N, boxes*(coords+obj+classes), H, W), where "obj" indicates the confidence of an object, and only one confidence is supported. Boxes are arranged as xx...xyy...yww...whh...hbb...bc0c0..c0c1c1...c1......cncn...cn. + +*@par Attributes: +*@li boxes: A required int32, specifying the number of anchor boxes. Defaults to "5" for V2 or "3" for V3. +*@li coords: An int32, specifying the number of parameters required for locating an object. The value is fixed at "4", corresponding to (x,y,w,h). +*@li classes: An int32, specifying the number of prediction classes. Defaults to "80". The value range is [1, 1024].
+*@li yolo_version: A string, specifying the YOLO version, either "V2" or "V3". +*@li softmax: A bool, specifying whether to perform softmax, valid only when "yolo_version = V2". +*@li background: A bool, specifying the operation types of the obj and classes, used in conjunction with "softmax" and valid only when "yolo_version = V2". +*@li softmaxtree: A bool. + +*@par Outputs: +*@li coord_data: A float16 or float32 with shape [N, boxes*coords, ceilx(height*width*2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the coordinates of a detected box. +*@li obj_prob: A float16 or float32 with shape [N, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the confidence. +*@li classes_prob: A float16 or float32 with shape [N, classes, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the prediction classes. + +*@attention Constraints: +*@li This operator applies to YOLO v2 and v3 networks. +*@li The succeeding layer of the Yolo operator must be operator Yolov3DetectionOutput. +*/ +REG_OP(Yolo) + .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .ATTR(boxes, Int, 3) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(yolo_version, String, "V3") + .ATTR(softmax, Bool, false) + .ATTR(background, Bool, false) + .ATTR(softmaxtree, Bool, false) + .OP_END_FACTORY_REG(Yolo) + +/** +*@brief Performs YOLO V2 detection. + +*@par Inputs: +* Four inputs, including: +*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov2DetectionOutput. \n Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +*@li imginfo: A float16, describing the image information including the required image height and width \n and the actual image height and width. +* +*@par Attributes: +*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" +*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. +*@li coords: Specifies the number of coordinate parameters. Must be 4. +*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. +*@li relative: An optional bool. Defaults to and must be "true". +*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo. The value range is [0.0, 1.0]. + +*@li post_nms_topn: An optional int32. This attribute is reserved. +*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo. The value range is [0.0, 1.0]. +*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n +*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". +* +*@par Outputs: +*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence.
+*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. + +*@attention Constraints:\n +*@li This operator applies only to the YOLO v2 network. +*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator. + +*@see Yolo() +*/ +REG_OP(YoloV2DetectionOutput) + .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) + .REQUIRED_ATTR(biases, ListFloat) + .ATTR(boxes, Int, 5) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(relative, Bool, true) + .ATTR(obj_threshold, Float, 0.5) + .ATTR(post_nms_topn, Int, 1024) + .ATTR(score_threshold, Float, 0.5) + .ATTR(iou_threshold, Float, 0.45) + .ATTR(pre_nms_topn, Int, 512) + .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(box_out_num, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(YoloV2DetectionOutput) + +/** +*@brief Performs YOLO V2 detection. + +*@par Inputs: +*Six inputs, including: +*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov2DetectionOutput. \n Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. +*@li imginfo: A float16, describing the image information including the required image height and width \n and the actual image height and width. +*@li windex: A windex tensor with shape [height, width]. Has the same type as the inputs. [[0,1,2...(width-1)],[0,1,2...(width-1)]...[0,1,2...(width-1)]] consisting of h groups of [0, 1, 2...(width-1)] is formed. \n + +*@li hindex: A hindex tensor with shape [height, width]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]]. \n + +* +*@par Attributes: +*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" +*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. +*@li coords: Specifies the number of coordinate parameters. Must be 4. +*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. +*@li relative: An optional bool. Defaults to and must be "true". +*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo. The value range is [0.0, 1.0]. +*@li post_nms_topn: An optional int32. This attribute is reserved. +*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo. The value range is [0.0, 1.0]. + +*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n +*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". +* +*@par Outputs: +*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. +*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. +* +*@attention Constraints:\n +*@li This operator applies only to the YOLO v2 network. +*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator.
+ +*@see Yolo() +*/ +REG_OP(YoloV2DetectionOutputD) + .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(windex, TensorType({DT_FLOAT16,DT_FLOAT})) + .INPUT(hindex, TensorType({DT_FLOAT16,DT_FLOAT})) + .REQUIRED_ATTR(biases, ListFloat) + .ATTR(boxes, Int, 5) + .ATTR(coords, Int, 4) + .ATTR(classes, Int, 80) + .ATTR(relative, Bool, true) + .ATTR(obj_threshold, Float, 0.5) + .ATTR(post_nms_topn, Int, 1024) + .ATTR(score_threshold, Float, 0.5) + .ATTR(iou_threshold, Float, 0.45) + .ATTR(pre_nms_topn, Int, 512) + .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) + .OUTPUT(box_out_num, TensorType({DT_INT32})) + .OP_END_FACTORY_REG(YoloV2DetectionOutputD) +
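The windex/hindex inputs taken by the D-variant detection ops (YoloV2DetectionOutputD above, YoloV3DetectionOutputD below) are plain index grids matching the input descriptions: each windex row counts 0..width-1 and each hindex row repeats its own row index. A sketch of how a caller might build them (plain float buffers here; the real inputs are float16/float32 tensors, and for V3 one grid pair is built per Yolo output):

// windex[h][w] = w, hindex[h][w] = h, flattened row-major.
#include <vector>

void MakeYoloIndexGrids(int height, int width,
                        std::vector<float> &windex, std::vector<float> &hindex) {
  windex.assign(static_cast<size_t>(height) * width, 0.0f);
  hindex.assign(static_cast<size_t>(height) * width, 0.0f);
  for (int h = 0; h < height; ++h) {
    for (int w = 0; w < width; ++w) {
      windex[static_cast<size_t>(h) * width + w] = static_cast<float>(w);
      hindex[static_cast<size_t>(h) * width + w] = static_cast<float>(h);
    }
  }
}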
+
+*@see Yolo()
+*/
+REG_OP(YoloV3DetectionOutput)
+    .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .REQUIRED_ATTR(biases_low, ListFloat)
+    .REQUIRED_ATTR(biases_mid, ListFloat)
+    .REQUIRED_ATTR(biases_high, ListFloat)
+    .ATTR(boxes, Int, 3)
+    .ATTR(coords, Int, 4)
+    .ATTR(classes, Int, 80)
+    .ATTR(relative, Bool, true)
+    .ATTR(obj_threshold, Float, 0.5)
+    .ATTR(post_nms_topn, Int, 1024)
+    .ATTR(score_threshold, Float, 0.5)
+    .ATTR(iou_threshold, Float, 0.45)
+    .ATTR(pre_nms_topn, Int, 512)
+    .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(box_out_num, TensorType({DT_INT32}))
+    .OP_END_FACTORY_REG(YoloV3DetectionOutput)
+
+/**
+*@brief Performs YOLO V3 detection.
+
+*@par Inputs:
+*Sixteen inputs, including:
+*@li The outputs of operator Yolo at the preceding layer (that is, three Yolo operators on YOLO v3) are used as the inputs of operator Yolov3DetectionOutputD. \n
+A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo.
+*@li img_info: A float16, describing the image information, including the required image height and width \n
+and the actual image height and width.
+*@li windex: A windex tensor with shape [height, width]. Has the same type as the inputs. [[0,1,2...(width-1)],[0,1,2...(width-1)]...[0,1,2...(width-1)]], consisting of "height" groups of [0, 1, 2...(width-1)], is formed for each of the three Yolo outputs, respectively.
+
+*@li hindex: A hindex tensor with shape [height, width]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]] is formed for each of the three Yolo outputs, respectively.
+
+*
+*@par Attributes:
+*@li biases: A required float32 list. "biases = Number of Yolo operators at the preceding layer x 2 x boxes"
+*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer.
+*@li coords: Specifies the number of coordinate parameters. Must be 4.
+*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80].
+*@li relative: An optional bool. Must be "true" (the default).
+*@li obj_threshold: A required float, specifying the confidence threshold for box filtering (based on the output "obj" of operator Yolo). The value range is [0.0, 1.0].
+*@li post_nms_topn: An optional int32. This attribute is reserved.
+*@li score_threshold: A required float, specifying the class score threshold for box filtering (based on the output "class" of operator Yolo). The value range is [0.0, 1.0].
+*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n
+*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "512".
+*
+*@par Outputs:
+*@li box_out: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence.
+*@li box_out_num: An NCHW tensor of type int32, specifying the number of output boxes.
+
+*@attention Constraints:\n
+*@li This operator applies only to the YOLO v3 network.
+*@li The preceding layer of operator Yolov3DetectionOutputD must be three Yolo operators.
+*@see Yolo()
+*/
+REG_OP(YoloV3DetectionOutputD)
+    .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(windex1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(windex2, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(windex3, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(hindex1, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(hindex2, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .INPUT(hindex3, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .REQUIRED_ATTR(biases_low, ListFloat)
+    .REQUIRED_ATTR(biases_mid, ListFloat)
+    .REQUIRED_ATTR(biases_high, ListFloat)
+    .ATTR(boxes, Int, 3)
+    .ATTR(coords, Int, 4)
+    .ATTR(classes, Int, 80)
+    .ATTR(relative, Bool, true)
+    .ATTR(obj_threshold, Float, 0.5)
+    .ATTR(post_nms_topn, Int, 1024)
+    .ATTR(score_threshold, Float, 0.5)
+    .ATTR(iou_threshold, Float, 0.45)
+    .ATTR(pre_nms_topn, Int, 512)
+    .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(box_out_num, TensorType({DT_INT32}))
+    .OP_END_FACTORY_REG(YoloV3DetectionOutputD)
+
+/**
+*@brief Spatial Pyramid Pooling, multi-level pooling.
+* Pooling out(n, sigma(c*2^i*2^i)) tensor, i in range[0,pyramid_height).
+
+*@par Inputs:
+*x: An NCHW tensor of type float16 or float32.
+
+*@par Attributes:
+* @li pyramid_height: A required int32.
+* Multi-level pooling out from 2^0 to 2^(pyramid_height-1).
+* @li pool_method: An optional int32, specifying the pooling method: 0-MAX, 1-AVE.
+* Defaults to "0".
+
+*@par Outputs:
+*y: An NCHW tensor of type float16 or float32.
+
+*@attention Constraints:
+* @li pyramid_height: "pyramid_height" must be in the range [0, 7).
+* @li feature_size: the height and width of the input feature map must be in the range [1, 510].
+
+*/
+REG_OP(SPP)
+    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
+    .REQUIRED_ATTR(pyramid_height, Int)
+    .ATTR(pool_method, Int, 0)
+    .OP_END_FACTORY_REG(SPP)
+
 } // namespace ge
 #endif  // GE_OP_NN_DETECT_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
index 10f3f369..87cc004c 100644
--- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
@@ -487,34 +487,6 @@ REG_OP(Upsample)
     .ATTR(stride_h, Int, 2)
     .ATTR(stride_w, Int, 2)
     .OP_END_FACTORY_REG(Upsample)
-
-/**
-*@brief Spatial Pyramid Pooling, multi-level pooling.
-* Pooling out(n, sigma(c*2^i*2^i)) tensor, i in range[0,pyramid_height).
-
-*@par Inputs:
-*x: An NCHW tensor, support float16 or float32 type.
-
-*@par Attributes:
-* @li pyramid_height: An required int32.
-* Multi-level pooling out from 2^0 to 2^(pyramid_height-1).
-* @li pool_method: An optional int32, pooling method: 0-MAX, 1-AVE.
-* Defaults to "0".
-
-*@par Outputs:
-*y: A NCHW tensor, support float16 or float32 type.
-
-*@attention Constraints:
-* @li pyramid_height: pyramid_heigjt should be in range [0,7).
-* @li feature_size:input feture map h and w should be [1, 510].
-
-*/
-REG_OP(SPP)
-    .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16}))
-    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16}))
-    .REQUIRED_ATTR(pyramid_height, Int)
-    .ATTR(pool_method, Int, 0)
-    .OP_END_FACTORY_REG(SPP)
 } // namespace ge
 
 #endif  // GE_OP_NN_POOLING_OPS_H
diff --git a/third_party/fwkacllib/inc/ops/nn_training_ops.h b/third_party/fwkacllib/inc/ops/nn_training_ops.h
index 922869c3..88d1a913 100644
--- a/third_party/fwkacllib/inc/ops/nn_training_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_training_ops.h
@@ -164,7 +164,8 @@ REG_OP(SparseApplyAdagrad)
 *@li update_slots: An optional bool. Defaults to "True". If "True", the calcution will be different as "False".
 
 *@par Outputs:
-*var: A Tensor. Has the same type and format as input "var".
+*@li var: A Tensor. Has the same type and format as input "var".
+*@li accum: A Tensor. Has the same type and format as input "accum".
 
 */
 REG_OP(SparseApplyAdagradD)
@@ -183,7 +184,7 @@ REG_OP(SparseApplyAdagradD)
 *@brief Updates relevant entries in "var" and "accum" according to the adagrad scheme.
 
 *@par Inputs:
-* Five inputs, including:
+*Six inputs, including:
 *@li var: An NCHW, NHWC, or ND Tensor of type float32.
 *@li accum: An NCHW, NHWC, or ND Tensor of type float32.
 *@li lr: An NCHW, NHWC, or ND Tensor of type float32.
@@ -215,7 +216,7 @@ REG_OP(SparseApplyAdagradV2)
 *@brief Updates relevant entries in "var" and "accum" according to the adagrad scheme.
 
 *@par Inputs:
-* Four inputs, including:
+*Four inputs, including:
 *@li var: An NCHW, NHWC, or ND Tensor of type float32.
 *@li accum: An NCHW, NHWC, or ND Tensor of type float32.
 *@li grad: An NCHW, NHWC, or ND Tensor of type float32.
@@ -228,8 +229,8 @@ REG_OP(SparseApplyAdagradV2)
 *@li update_slots: An optional bool. Defaults to "True". If "True", the calcution will be different as "False".
 
 *@par Outputs:
-*var: A Tensor. Has the same type and format as input "var".
-*accum: A Tensor. Has the same type and format as input "accum".
+*@li var: A Tensor. Has the same type and format as input "var".
+*@li accum: A Tensor. Has the same type and format as input "accum".
 
 */
 REG_OP(SparseApplyAdagradV2D)
@@ -299,6 +300,39 @@ REG_OP(ApplyMomentumCCE)
     .ATTR(use_locking, Bool, false)
     .OP_END_FACTORY_REG(ApplyMomentumCCE)
 
+/**
+*@brief Updates "var" according to the momentum scheme. Set use_nesterov = True if you
+* want to use Nesterov momentum.\n
+* Computing process: \n
+* accum = accum * momentum + grad\n
+* var -= lr * accum
+*
+*@attention Constraints:\n
+* The input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable tensor. Should be from a Variable().
+*@li accum: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li lr: A scalar. Has the same type as "var".
+*@li grad: A tensor for the gradient. Has the same type as "var".
+*
+*@par Attributes:
+*@li use_nesterov: An optional bool. Defaults to "False".
+* If "True", the tensor passed to compute grad will be
+* var - lr * momentum * accum, so in the end, the var you get is actually
+* var - lr * momentum * accum.
+*
+*@li use_locking: An optional bool. Defaults to "False".\n
+* If "True", updating of the "var" and "accum" tensors is protected by a lock;
+* otherwise the behavior is undefined, but may exhibit less contention.
+*
+*@par Outputs:
+*@li var: A mutable tensor. Has the same type as input "var".
+*@li accum: A mutable tensor. Has the same type as input "accum".
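+*
+* A minimal scalar sketch of the update above (an illustration only, not part
+* of this header; all names are local to the example):
+*
+*   // One momentum step; the Nesterov branch follows the use_nesterov note.
+*   void MomentumStep(float &var, float &accum, float lr, float momentum,
+*                     float grad, bool use_nesterov) {
+*     accum = accum * momentum + grad;
+*     if (use_nesterov) {
+*       var -= grad * lr + accum * momentum * lr;
+*     } else {
+*       var -= lr * accum;
+*     }
+*   }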
+*
+*/
+
 REG_OP(ApplyMomentumD)
     .INPUT(var, TensorType::NumberType())
     .INPUT(accum, TensorType::NumberType())
@@ -354,6 +388,51 @@ REG_OP(ApplyPowerSign)
     .OP_END_FACTORY_REG(ApplyPowerSign)
 
 /**
+*@brief Updates "var" according to the PowerSign update.\n
+* Here t-1 means the previous period.
+* m_t <- beta1 * m_{t-1} + (1 - beta1) * grad\n
+* update <- exp(logbase * sign_decay * sign(grad) * sign(m_t)) * grad\n
+* var <- var - lr * update
+*
+*@attention Constraints:\n
+* The input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable tensor. Should be from a Variable().
+*@li m: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li lr: A scalar. Has the same type as "var".
+*@li logbase: A scalar. Has the same type as "var".
+*@li sign_decay: A scalar. Has the same type as "var".
+*@li beta: A scalar. Has the same type as "var".
+*@li grad: A tensor for the gradient. Has the same type as "var".
+*
+*@par Attributes:
+* use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var" and "m" tensors is protected
+* by a lock; otherwise the behavior is undefined, but may exhibit less
+* contention.
+*
+*@par Outputs:
+*@li var: A mutable tensor. Has the same type as input "var".
+*@li m: A mutable tensor. Has the same type as input "var".
+*
+*
+*/
+REG_OP(ApplyPowerSignD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(m, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(logbase, TensorType::NumberType())
+    .INPUT(sign_decay, TensorType::NumberType())
+    .INPUT(beta, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(m, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyPowerSignD)
+
+/**
 *@brief Updates "var" as FOBOS algorithm with fixed learning rate.\n
 * prox_v = var - alpha * delta\n
 * var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
@@ -426,6 +505,46 @@ REG_OP(ApplyAddSign)
     .OP_END_FACTORY_REG(ApplyAddSign)
 
 /**
+*@brief Updates "var" according to the AddSign update.
+
+*@par Inputs:
+*Seven inputs, including:
+* @li var: A mutable Tensor of type TensorType::NumberType().
+* Should be a Variable Tensor.
+* @li m: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+* @li alpha: A Tensor of the same type as "var". Must be a scalar.
+* @li sign_decay: A Tensor of the same type as "var". Must be a scalar.
+* @li beta: A Tensor of the same type as "var". Must be a scalar.
+* @li grad: A Tensor of the same type as "var", for the gradient.
+
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var" and "m" tensors will be
+* protected by a lock; otherwise the behavior is undefined,
+* but may exhibit less contention.
+
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li m: A mutable Tensor. Has the same type as "m".
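+*
+* A minimal scalar sketch of one AddSign step (an illustration only, not part
+* of this header; all names are local to the example, and the update rule is
+* the one documented for the non-"D" ApplyAddSign operator):
+*
+*   float SignOf(float x) { return static_cast<float>((x > 0.0f) - (x < 0.0f)); }
+*   void AddSignStep(float &var, float &m, float lr, float alpha,
+*                    float sign_decay, float beta, float grad) {
+*     m = beta * m + (1.0f - beta) * grad;  // m_t
+*     float update = (alpha + sign_decay * SignOf(grad) * SignOf(m)) * grad;
+*     var -= lr * update;
+*   }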
+
+*/
+REG_OP(ApplyAddSignD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(m, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(alpha, TensorType::NumberType())
+    .INPUT(sign_decay, TensorType::NumberType())
+    .INPUT(beta, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(m, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyAddSignD)
+
+/**
 *@brief Updates "var" according to the centered RMSProp algorithm.\n
 * The centered RMSProp algorithm uses an estimate of the centered second moment
 * (i.e., the variance) for normalization, as opposed to regular RMSProp, which
 * uses the (uncentered) second moment. This often helps with training, but is
 * slightly more expensive in terms of computation and memory.
@@ -481,6 +600,70 @@ REG_OP(ApplyCenteredRMSProp)
     .OUTPUT(var, TensorType::NumberType())
     .ATTR(use_locking, Bool, false)
     .OP_END_FACTORY_REG(ApplyCenteredRMSProp)
+
+/**
+*@brief Updates "var" according to the centered RMSProp algorithm.\n
+* The centered RMSProp algorithm uses an estimate of the centered second moment
+* (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+* uses the (uncentered) second moment. This often helps with training, but is
+* slightly more expensive in terms of computation and memory.
+*
+* Here t-1 means the previous period.
+* mg <- rho * mg_{t-1} + (1-rho) * grad\n
+* ms <- rho * ms_{t-1} + (1-rho) * grad * grad\n
+* mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)\n
+* var <- var - mom\n
+*
+*@attention Constraints:\n
+*@li In the dense implementation of this algorithm, "mg", "ms", and "mom" will
+* update even if "grad" is zero; in the sparse implementation, "mg", "ms",
+* and "mom" will not update in iterations during which "grad" is zero.
+*@li The input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable tensor. Should be from a Variable().
+*@li mg: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li ms: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li mom: A mutable tensor. Has the same type as "var".
+* Should be from a Variable().
+*@li lr: A scalar. Has the same type as "var".
+*@li rho: A scalar. Has the same type as "var".
+*@li momentum: A tensor. Has the same type as "var".
+*@li epsilon: A scalar. Has the same type as "var".
+*@li grad: A tensor for the gradient. Has the same type as "var".
+*
+*@par Attributes:
+* use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var", "ms", and "mom" tensors is protected
+* by a lock; otherwise the behavior is undefined, but may exhibit less
+* contention.
+*
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li mg: A mutable Tensor. Has the same type as "mg".
+*@li ms: A mutable Tensor. Has the same type as "ms".
+*@li mom: A mutable Tensor. Has the same type as "mom".
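+*
+* A minimal scalar sketch of the dense update sequence above (an illustration
+* only, not part of this header; assumes <cmath> for std::sqrt):
+*
+*   void CenteredRmsPropStep(float &var, float &mg, float &ms, float &mom,
+*                            float lr, float rho, float momentum,
+*                            float epsilon, float grad) {
+*     mg  = rho * mg + (1.0f - rho) * grad;
+*     ms  = rho * ms + (1.0f - rho) * grad * grad;
+*     mom = momentum * mom + lr * grad / std::sqrt(ms - mg * mg + epsilon);
+*     var -= mom;
+*   }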
+ +* +*/ +REG_OP(ApplyCenteredRMSPropD) + .INPUT(var, TensorType::NumberType()) + .INPUT(mg, TensorType::NumberType()) + .INPUT(ms, TensorType::NumberType()) + .INPUT(mom, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(rho, TensorType::NumberType()) + .INPUT(momentum, TensorType::NumberType()) + .INPUT(epsilon, TensorType::NumberType()) + .INPUT(grad, TensorType::NumberType()) + .OUTPUT(var, TensorType::NumberType()) + .OUTPUT(mg, TensorType::NumberType()) + .OUTPUT(ms, TensorType::NumberType()) + .OUTPUT(mom, TensorType::NumberType()) + .ATTR(use_locking, Bool, false) + .OP_END_FACTORY_REG(ApplyCenteredRMSPropD) /** *@brief Updates "var" by subtracting 'alpha' * 'delta' from it.\n @@ -590,29 +773,29 @@ REG_OP(ApplyAdagradD) * accum += grad * grad \n * var -= lr * grad * (1 / sqrt(accum) + epsilon) * -* @attention Constraints: -* the input tensors must have the same shape. -* * @par Inputs: * @li var: A mutable tensor. Must be one of the data types defined in -* TensorType::NumberType(). Should be from a Variable(). +* TensorType::NumberType(). Should be from a Variable(). * @li accum: A mutable tensor. Has the same type as "var". Should be from a -* Variable(). +* Variable(). * @li lr: A tensor for the learning rate. Has the same type as "var". Should be -* from a Variable(). +* from a Variable(). * @li grad: A tensor for the gradient. Has the same type as "var". Should be -* from a Variable(). +* from a Variable(). * @li epsilon: A scalar. Has the same type as "var". * * @par Attributes: * @li update_slots: An optional bool. Defaults to "True". -* If "True", accum will be updated +* If "True", accum will be updated * @li use_locking: An optional bool. Defaults to "False". -* If "True", updating of the "var" tensor is protected by a lock; -* otherwise the behavior is undefined, but may exhibit less contention. +* If "True", updating of the "var" tensor is protected by a lock; +* otherwise the behavior is undefined, but may exhibit less contention. * * @par Outputs: -* var: A mutable tensor. Has the same type as input "var". +* var: A mutable tensor. Has the same type as input "var". +* +* @attention Constraints: +* The input tensors must have the same shape. * * */ @@ -630,32 +813,32 @@ REG_OP(ApplyAdagradV2) /** * @brief Updates "var" according to the adagradv2 scheme.\n -* accum += grad * grad \n -* var -= lr * grad * (1 / sqrt(accum) + epsilon) -* -* @attention Constraints: -* the input tensors must have the same shape. +* accum += grad * grad \n +* var -= lr * grad * (1 / sqrt(accum) + epsilon) * * @par Inputs: * @li var: A mutable tensor. Must be one of the data types defined in -* TensorType::NumberType(). Should be from a Variable(). +* TensorType::NumberType(). Should be from a Variable(). * @li accum: A mutable tensor. Has the same type as "var". Should be from a -* Variable(). +* Variable(). * @li lr: A tensor for the learning rate. Has the same type as "var". Should be -* from a Variable(). +* from a Variable(). * @li grad: A tensor for the gradient. Has the same type as "var". Should be -* from a Variable(). +* from a Variable(). * * @par Attributes: * @li epsilon: A scalar. Has the same type as "var". * @li update_slots: An optional bool. Defaults to "True". -* If "True", accum will be updated +* If "True", accum will be updated * @li use_locking: An optional bool. Defaults to "False". -* If "True", updating of the "var" tensor is protected by a lock; -* otherwise the behavior is undefined, but may exhibit less contention. 
+* If "True", updating of the "var" tensor is protected by a lock; +* otherwise the behavior is undefined, but may exhibit less contention. * * @par Outputs: -* var: A mutable tensor. Has the same type as input "var". +* var: A mutable tensor. Has the same type as input "var". +* +* @attention Constraints: +* The input tensors must have the same shape. * * */ @@ -950,7 +1133,9 @@ REG_OP(ApplyRMSPropD) *use_locking: An optional bool. Defaults to "False". If "True", updating of the "var" and "accum" *tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less *contention. *@par Outputs: -*var: A mutable Tensor. Has the same type as "var". +* @li var: A mutable tensor. Must have the same type as input "var". +* @li ms: A mutable tensor. Must have the same type as input "ms". +* @li mom: A mutable tensor. Must have the same type as input "mom". */ REG_OP(ApplyProximalAdagrad) .INPUT(var, TensorType::NumberType()) @@ -964,6 +1149,39 @@ REG_OP(ApplyProximalAdagrad) .OP_END_FACTORY_REG(ApplyProximalAdagrad) /** +*@brief Update "var" and "accum" according to FOBOS with Adagrad learning rate. + +*@par Inputs: +*Six inputs, including: +* @li var: A mutable Tensor of type TensorType::NumberType(). +* Should be from a Variable(). +* @li accum: A mutable Tensor of the same type as "var". Should be from a Variable(). +* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar. +* @li l1: A Tensor of the same type as "var", for L1 regulariation. Must be a scalar. +* @li l2: A Tensor of the same type as "var", for L2 regulariation. Must be a scalar. +* @li grad: A Tensor of the same type as "var", for the gradient. + +*@par Attributes: +*use_locking: An optional bool. Defaults to "False". If "True", updating of the "var" and "accum" *tensors will be protected by a lock; otherwise the behavior is undefined, but may exhibit less *contention. + +*@par Outputs: +* @li var: A mutable Tensor. Has the same type as "var". +* @li accum: A mutable Tensor. Has the same type as "var". + +*/ +REG_OP(ApplyProximalAdagradD) + .INPUT(var, TensorType::NumberType()) + .INPUT(accum, TensorType::NumberType()) + .INPUT(lr, TensorType::NumberType()) + .INPUT(l1, TensorType::NumberType()) + .INPUT(l2, TensorType::NumberType()) + .INPUT(grad, TensorType::NumberType()) + .OUTPUT(var, TensorType::NumberType()) + .OUTPUT(accum, TensorType::NumberType()) + .ATTR(use_locking, Bool, false) + .OP_END_FACTORY_REG(ApplyProximalAdagradD) + +/** *@brief Updates entries in 'var' and 'accum' according to the Proximal Adagrad algorithm.\ n * Compared with op ApplyProximalAdagrad, an additional index tensor is input, * Only the indices into the first dimensions of "var" and "accum" are updated. @@ -1006,6 +1224,51 @@ REG_OP(SparseApplyProximalAdagrad) .OP_END_FACTORY_REG(SparseApplyProximalAdagrad) /** +*@brief Updates entries in 'var' and 'accum' according to the Proximal Adagrad algorithm.\ n +* Compared with op ApplyProximalAdagrad, an additional index tensor is input, +* Only the indices into the first dimensions of "var" and "accum" are updated. + +*@par Inputs: +* Seven inputs, including:\n +* @li var: A mutable Tensor.\n +* TensorType::NumberType(). Should be a Variable Tensor. +* @li accum: A mutable Tensor of the same type as "var".\n +* Should be a Variable Tensor. +* @li lr: A Tensor of the same type as "var".\n +* Scaling factor. Must be a scalar. +* @li l1: A Tensor of the same type as "var".\n +* L1 regulariation. Must be a scalar. 
+* @li l2: A Tensor of the same type as "var".\n
+* L2 regularization. Must be a scalar.
+* @li grad: A Tensor. Has the same type as "var". \n
+* The gradient.
+* @li indices: A vector of indices into the first dimension of "var" and "accum".\n
+* TensorType::IndexNumberType().
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".\n
+* If "True", updating of the var and accum tensors will be protected by a lock; \n
+* If "False", the behavior is undefined, but may exhibit less contention.
+
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li accum: A mutable Tensor. Has the same type as "var".
+
+*/
+REG_OP(SparseApplyProximalAdagradD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(accum, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(l1, TensorType::NumberType())
+    .INPUT(l2, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(indices, TensorType::IndexNumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(accum, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(SparseApplyProximalAdagradD)
+
+/**
 *@brief Updates "var" according to the Ftrl-proximal scheme.
 
 *@par Inputs:
@@ -1045,6 +1308,50 @@ REG_OP(ApplyFtrl)
     .OP_END_FACTORY_REG(ApplyFtrl)
 
 /**
+*@brief Updates "var" according to the Ftrl-proximal scheme.
+
+*@par Inputs:
+*Eight inputs, including:
+* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
+* Should be a Variable Tensor.
+* @li accum: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li linear: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li grad: A Tensor of the same type as "var", for the gradient.
+* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+* @li l1: A Tensor of the same type as "var", for L1 regularization. Must be a scalar.
+* @li l2: A Tensor of the same type as "var", for L2 regularization. Must be a scalar.
+* @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var" and "accum" tensors will be
+* protected by a lock; otherwise the behavior is undefined,
+* but may exhibit less contention.
+
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li accum: A mutable Tensor. Has the same type as "accum".
+*@li linear: A mutable Tensor. Has the same type as "linear".
+
+*/
+REG_OP(ApplyFtrlD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(accum, TensorType::NumberType())
+    .INPUT(linear, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(l1, TensorType::NumberType())
+    .INPUT(l2, TensorType::NumberType())
+    .INPUT(lr_power, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(accum, TensorType::NumberType())
+    .OUTPUT(linear, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyFtrlD)
+
+/**
 *@brief Update "var" according to the Ftrl-proximal scheme.
 
 *@par Inputs:
@@ -1086,6 +1393,52 @@ REG_OP(ApplyFtrlV2)
     .OP_END_FACTORY_REG(ApplyFtrlV2)
 
 /**
+*@brief Update "var" according to the Ftrl-proximal scheme.
+
+*@par Inputs:
+*Nine inputs, including:
+* @li var: A mutable Tensor. Must be of type TensorType::NumberType().
+* Should be a Variable Tensor.
+* @li accum: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li linear: A mutable Tensor of the same type as "var".
+* Should be a Variable Tensor.
+* @li grad: A Tensor of the same type as "var", for the gradient.
+* @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+* @li l1: A Tensor of the same type as "var", for L1 regularization. Must be a scalar.
+* @li l2: A Tensor of the same type as "var", for L2 regularization. Must be a scalar.
+* @li l2_shrinkage: A Tensor of the same type as "var".
+* @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
+
+*@par Attributes:
+*use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var" and "accum" tensors will be
+* protected by a lock; otherwise the behavior is undefined,
+* but may exhibit less contention.
+
+*@par Outputs:
+*@li var: A mutable Tensor. Has the same type as "var".
+*@li accum: A mutable Tensor. Has the same type as "accum".
+*@li linear: A mutable Tensor. Has the same type as "linear".
+
+*/
+REG_OP(ApplyFtrlV2D)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(accum, TensorType::NumberType())
+    .INPUT(linear, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(l1, TensorType::NumberType())
+    .INPUT(l2, TensorType::NumberType())
+    .INPUT(l2_shrinkage, TensorType::NumberType())
+    .INPUT(lr_power, TensorType::NumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(accum, TensorType::NumberType())
+    .OUTPUT(linear, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(ApplyFtrlV2D)
+
+/**
 *@brief Updates "var" according to the Adam algorithm.\n
 * lr_t <- text{learning\_rate} * sqrt{1 - beta_2^t} / (1 - beta_1^t)\n
 * m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g\n
@@ -1137,6 +1490,45 @@ REG_OP(ApplyAdam)
     .ATTR(use_nesterov, Bool, false)
     .OP_END_FACTORY_REG(ApplyAdam)
 
+/**
+*@brief Updates "var" according to the Adam algorithm.\n
+* lr_t <- text{learning\_rate} * sqrt{1 - beta_2^t} / (1 - beta_1^t)\n
+* m_t <- beta_1 * m_{t-1} + (1 - beta_1) * g\n
+* v_t <- beta_2 * v_{t-1} + (1 - beta_2) * g * g\n
+* variable <- variable - lr_t * m_t / (sqrt{v_t} + epsilon)
+*
+*@attention Constraints:\n
+* The input tensors must have the same shape.
+*
+*@par Inputs:
+*@li var: A mutable Tensor of the type TensorType::NumberType().
+* Should be from a Variable().
+*@li m: A mutable Tensor of the same type as "var".
+* Should be from a Variable().
+*@li v: A mutable Tensor of the same type as "var".
+* Should be from a Variable().
+*@li beta1_power: A scalar of the same type as "var".
+*@li beta2_power: A scalar of the same type as "var".
+*@li lr: learning_rate. A scalar of the same type as "var".
+*@li beta1: A scalar of the same type as "var".
+*@li beta2: A scalar of the same type as "var".
+*@li epsilon: A scalar of the same type as "var".
+*@li grad: A Tensor of the same type as "var", for the gradient.
+*
+*@par Attributes:
+*@li use_locking: An optional bool. Defaults to "False".
+* If "True", updating of the "var", "m", and "v" tensors will be protected
+* by a lock; otherwise the behavior is undefined, but may exhibit less
+* contention.
+*@li use_nesterov: An optional bool. Defaults to "False".
+* If "True", uses the Nesterov update.
+*
+*@par Outputs:
+*@li var: A mutable tensor. Has the same type as input "var".
+*@li m: A mutable tensor. Has the same type as input "m".
+*@li v: A mutable tensor. Has the same type as input "v".
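+*
+* A minimal scalar sketch of one Adam step as documented above (an
+* illustration only, not part of this header; assumes <cmath> for std::sqrt):
+*
+*   void AdamStep(float &var, float &m, float &v, float beta1_power,
+*                 float beta2_power, float lr, float beta1, float beta2,
+*                 float epsilon, float grad) {
+*     float lr_t = lr * std::sqrt(1.0f - beta2_power) / (1.0f - beta1_power);
+*     m = beta1 * m + (1.0f - beta1) * grad;
+*     v = beta2 * v + (1.0f - beta2) * grad * grad;
+*     var -= lr_t * m / (std::sqrt(v) + epsilon);
+*   }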
+ +*/ REG_OP(ApplyAdamD) .INPUT(var, TensorType::NumberType()) .INPUT(m, TensorType::NumberType()) @@ -1154,6 +1546,7 @@ REG_OP(ApplyAdamD) .ATTR(use_locking, Bool, false) .ATTR(use_nesterov, Bool, false) .OP_END_FACTORY_REG(ApplyAdamD) + /** *@brief Updates "var" according to the proximal adadelta scheme. @@ -1401,11 +1794,11 @@ REG_OP(LarsV2Update) * @par Inputs: * Nine inputs, including: * @li var: A mutable Tensor. Must be of type TensorType::NumberType(). -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li accum: A mutable Tensor of the same type as "var". -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li linear: A mutable Tensor of the same type as "var". -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li grad: A Tensor of the same type as "var", for the gradient. * @li indices: A vector of indices into the first dimension of var and accum. * @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar. @@ -1415,9 +1808,9 @@ REG_OP(LarsV2Update) * @par Attributes: * use_locking: An optional bool. Defaults to "False". -* If "True", updating of the "var" and "accum" tensors will be -* protected by a lock; otherwise the behavior is undefined, -* but may exhibit less contention. +* If "True", updating of the "var" and "accum" tensors will be +* protected by a lock; otherwise the behavior is undefined, +* but may exhibit less contention. * @par Outputs: * var: A Tensor. Has the same type and format as input "var". @@ -1441,13 +1834,13 @@ REG_OP(SparseApplyFtrl) * @brief Update relevant entries in '*var' according to the Ftrl-proximal scheme. * @par Inputs: -* Nine inputs, including: +* Five inputs, including: * @li var: A mutable Tensor. Must be of type TensorType::NumberType(). -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li accum: A mutable Tensor of the same type as "var". -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li linear: A mutable Tensor of the same type as "var". -* Should be a Variable Tensor. +* Should be a Variable Tensor. * @li grad: A Tensor of the same type as "var", for the gradient. * @li indices: A vector of indices into the first dimension of var and accum. * @li lr: A Tensor of the same type as "var", for the scaling factor. Must be a scalar. @@ -1457,14 +1850,14 @@ REG_OP(SparseApplyFtrl) * @par Attributes: * use_locking: An optional bool. Defaults to "False". -* If "True", updating of the "var" and "accum" tensors will be -* protected by a lock; otherwise the behavior is undefined, -* but may exhibit less contention. +* If "True", updating of the "var" and "accum" tensors will be +* protected by a lock; otherwise the behavior is undefined, +* but may exhibit less contention. * @par Outputs: -* var: A Tensor. Has the same type and format as input "var". -* accum: A Tensor. Has the same type and format as input "accum". -* linear: A Tensor. Has the same type and format as input "linear". +* @li var: A Tensor. Has the same type and format as input "var". +* @li accum: A Tensor. Has the same type and format as input "accum". +* @li linear: A Tensor. Has the same type and format as input "linear". */ REG_OP(SparseApplyFtrlD) @@ -1533,13 +1926,13 @@ REG_OP(SparseApplyFtrlV2) * That is for rows we have grad for, we update var, accum and linear * @par Inputs: -* Ten inputs, including: +* Five inputs, including: * @li var: A mutable Tensor. Must be of type TensorType::NumberType(). -* Should be a Variable Tensor. +* Should be a Variable Tensor. 
 * @li accum: A mutable Tensor of the same type as "var".
-* Should be a Variable Tensor.
+* Should be a Variable Tensor.
 * @li linear: A mutable Tensor of the same type as "var".
-* Should be a Variable Tensor.
+* Should be a Variable Tensor.
 * @li grad: A Tensor of the same type as "var", for the gradient.
 * @li indices: A vector of indices into the first dimension of var and accum.
 
@@ -1550,14 +1943,14 @@ REG_OP(SparseApplyFtrlV2)
 * @li l2_shrinkage: A Tensor of the same type as "var", L2 shrinkage regulariation. Must be a scalar.
 * @li lr_power: A Tensor of the same type as "var", for the scaling factor. Must be a scalar.
 * @li use_locking: An optional bool. Defaults to "False".
-* If "True", updating of the "var" and "accum" tensors will be
-* rotected by a lock; otherwise the behavior is undefined,
-* but may exhibit less contention.
+* If "True", updating of the "var" and "accum" tensors will be
+* protected by a lock; otherwise the behavior is undefined,
+* but may exhibit less contention.
 
 * @par Outputs:
-* var: A Tensor. Has the same type and format as input "var".
-* accum: A Tensor. Has the same type and format as input "accum".
-* linear: A Tensor. Has the same type and format as input "linear".
+* @li var: A Tensor. Has the same type and format as input "var".
+* @li accum: A Tensor. Has the same type and format as input "accum".
+* @li linear: A Tensor. Has the same type and format as input "linear".
 
 */
 REG_OP(SparseApplyFtrlV2D)
@@ -1578,6 +1971,109 @@ REG_OP(SparseApplyFtrlV2D)
     .OP_END_FACTORY_REG(SparseApplyFtrlV2D)
 
 /**
+* @brief Updates "var" at the specified indices according to the RMSProp algorithm.
+* mean_square = decay * mean_square + (1-decay) * gradient ** 2\n
+* Delta = learning_rate * gradient / sqrt(mean_square + epsilon)\n
+* ms <- rho * ms_{t-1} + (1-rho) * grad * grad\n
+* mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\n
+* var <- var - mom\n
+*
+* @par Inputs:
+* @li var: A mutable tensor. Must be one of the data types defined in\n
+* TensorType::NumberType(). Should be from a Variable().
+* @li ms: A mutable tensor. Must have the same type as "var". Should be from a
+* Variable().
+* @li mom: A mutable tensor. Must have the same type as "var". Should be from a
+* Variable().
+* @li lr: A scalar. Must have the same type as "var".
+* @li rho: A scalar. Must have the same type as "var".
+* @li momentum: A scalar. Must have the same type as "var".
+* @li epsilon: A scalar. Must have the same type as "var".
+* @li grad: A tensor, specifying the gradient.
+* @li indices: A vector of indices into the first dimension of "var", "ms" and "mom".
+*
+* @par Attributes:
+* use_locking: An optional "bool". Defaults to "False". If "True", updating of
+* the "var", "ms", and "mom" tensors will be protected by a lock; otherwise the
+* behavior is undefined, but may exhibit less contention.
+*
+* @par Outputs:
+* var: A mutable tensor. Has the same type as input "var".
+*
+* @attention Constraints:
+* @li Note that in this sparse implementation, "ms" and "mom" will not update
+* in iterations during which "grad" is 0.
+* @li The input tensors "var", "ms", and "mom" must have the same shape.
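+*
+* A minimal sketch of the sparse update above (an illustration only, not part
+* of this header; assumes one float per row for brevity, plus <cmath>,
+* <cstdint>, and <vector>):
+*
+*   // Only rows named in "indices" are touched; grad is packed per index.
+*   void SparseRmsPropStep(std::vector<float> &var, std::vector<float> &ms,
+*                          std::vector<float> &mom, float lr, float rho,
+*                          float momentum, float epsilon,
+*                          const std::vector<float> &grad,
+*                          const std::vector<int32_t> &indices) {
+*     for (size_t i = 0; i < indices.size(); ++i) {
+*       const int32_t row = indices[i];
+*       ms[row] = rho * ms[row] + (1.0f - rho) * grad[i] * grad[i];
+*       mom[row] = momentum * mom[row] + lr * grad[i] / std::sqrt(ms[row] + epsilon);
+*       var[row] -= mom[row];
+*     }
+*   }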
+*
+*/
+REG_OP(SparseApplyRMSProp)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(ms, TensorType::NumberType())
+    .INPUT(mom, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(rho, TensorType::NumberType())
+    .INPUT(momentum, TensorType::NumberType())
+    .INPUT(epsilon, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(indices, TensorType::IndexNumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(SparseApplyRMSProp)
+
+/**
+* @brief Updates "var" at the specified indices according to the RMSProp algorithm.
+* A const input will be considered as an attribute.\n
+* mean_square = decay * mean_square + (1-decay) * gradient ** 2\n
+* Delta = learning_rate * gradient / sqrt(mean_square + epsilon)\n
+* ms <- rho * ms_{t-1} + (1-rho) * grad * grad\n
+* mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)\n
+* var <- var - mom
+*
+* @par Inputs:
+* @li var: A mutable tensor. Must be one of the data types defined in
+* TensorType::NumberType(). Should be from a Variable().
+* @li ms: A mutable tensor. Must have the same type as "var". Should be from a
+* Variable().
+* @li mom: A mutable tensor. Must have the same type as "var". Should be from a
+* Variable().
+* @li lr: A scalar. Must have the same type as "var".
+* @li grad: A tensor, specifying the gradient.
+* @li indices: A vector of indices into the first dimension of "var", "ms" and "mom".
+*
+* @par Attributes:
+* @li use_locking: An optional "bool". Defaults to "False". If "True",
+* updating of the "var", "ms", and "mom" tensors will be protected by a lock;
+* otherwise the behavior is undefined, but may exhibit less contention.
+* @li rho: A required scalar. Must have the same type as "var".
+* @li momentum: A required scalar. Must have the same type as "var".
+* @li epsilon: A required scalar. Must have the same type as "var".
+*
+* @par Outputs:
+* @li var: A mutable tensor. Must have the same type as input "var".
+* @li ms: A mutable tensor. Must have the same type as input "ms".
+* @li mom: A mutable tensor. Must have the same type as input "mom".
+*
+* @attention Constraints:
+* @li Note that in this sparse implementation, "ms" and "mom" will not update
+* in iterations during which "grad" is 0.
+* @li The input tensors "var", "ms" and "mom" must have the same shape.
+*/
+REG_OP(SparseApplyRMSPropD)
+    .INPUT(var, TensorType::NumberType())
+    .INPUT(ms, TensorType::NumberType())
+    .INPUT(mom, TensorType::NumberType())
+    .INPUT(lr, TensorType::NumberType())
+    .INPUT(grad, TensorType::NumberType())
+    .INPUT(indices, TensorType::IndexNumberType())
+    .OUTPUT(var, TensorType::NumberType())
+    .OUTPUT(ms, TensorType::NumberType())
+    .OUTPUT(mom, TensorType::NumberType())
+    .REQUIRED_ATTR(rho, Float)
+    .REQUIRED_ATTR(momentum, Float)
+    .REQUIRED_ATTR(epsilon, Float)
+    .ATTR(use_locking, Bool, false)
+    .OP_END_FACTORY_REG(SparseApplyRMSPropD)
+
+/**
 *@brief Clean memory of workspace list.
*@par Attributes: diff --git a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h index 992077ad..46d29b8d 100644 --- a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h +++ b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h @@ -172,24 +172,6 @@ REG_OP(SigmoidGrad) .OUTPUT(z, TensorType(UnaryDataType)) .OP_END_FACTORY_REG(SigmoidGrad) -REG_OP(Activation) - .INPUT(x, TensorType::ALL()) - .OUTPUT(y, TensorType::ALL()) - /* - 0: sigmod, 1: relu, 2: tanh, 3: clipped ReLU, 4: Elu, - 5: leaky relu, 6: abs, 7: relu1, 8: softsign, 9: softplus - */ - .ATTR(mode, Int, 1) - .ATTR(coef, Float, 0) - .OP_END_FACTORY_REG(Activation) - -REG_OP(ActivationGrad) - .INPUT(dy, TensorType{DT_FLOAT}) - .INPUT(x, TensorType{DT_FLOAT}) - .OUTPUT(dx, TensorType{DT_FLOAT}) - .ATTR(mode, Int, 1) - .OP_END_FACTORY_REG(ActivationGrad) - /** *@brief Computes the binomial normal log likelihood (BNLL) output:\n *if x>0, x+log(1+exp(-x)); otherwise log(1+exp(x)). diff --git a/third_party/fwkacllib/inc/ops/power_ops.h b/third_party/fwkacllib/inc/ops/power_ops.h deleted file mode 100644 index b1f5bc24..00000000 --- a/third_party/fwkacllib/inc/ops/power_ops.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Copyright 2019-2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - #ifndef GE_OP_POWER_H - #define GE_OP_POWER_H - - #include "../graph/operator_reg.h" - - namespace ge { - -/** -*@brief Computes the output as (shift + scale * x) ^ power. - -*@par Inputs: -* x: A Tensor of type float16 or float32. - -*@par Attributes: -*@li power: Optional. Defaults to 1.0. -*@li scale: Optional. Defaults to 1.0. -*@li shift: Optional. Defaults to 0.0. - -*@par Outputs: -* y: A Tensor. Has the same type and shape as "x". 
-*/
-
- REG_OP(Power)
-     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
-     .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT}))
-     .ATTR(power, Float, 1.0)
-     .ATTR(scale, Float, 1.0)
-     .ATTR(shift, Float, 0.0)
-     .OP_END_FACTORY_REG(Power);
-
- } // namespace ge
-
- #endif // GE_OP_POWER_H
diff --git a/third_party/fwkacllib/inc/ops/quantize_ops.h b/third_party/fwkacllib/inc/ops/quantize_ops.h
index 235f2645..e44ae888 100644
--- a/third_party/fwkacllib/inc/ops/quantize_ops.h
+++ b/third_party/fwkacllib/inc/ops/quantize_ops.h
@@ -19,22 +19,6 @@
 #include "../graph/operator_reg.h"
 
 namespace ge {
-REG_OP(QuantizedInnerProduct)
-    .INPUT(x, TensorType({DT_UINT8}))
-    .INPUT(w, TensorType({DT_INT8}))
-    .OPTIONAL_INPUT(b, TensorType({DT_INT32}))
-    .OPTIONAL_INPUT(scale_q, TensorType({DT_FLOAT16}))
-    .OPTIONAL_INPUT(offset_q, TensorType({DT_FLOAT16}))
-    .OPTIONAL_INPUT(scale_deq_req, TensorType({DT_FLOAT16}))
-    .OPTIONAL_INPUT(offset_req, TensorType({DT_FLOAT16}))
-    .OUTPUT(y, TensorType({DT_FLOAT16}))
-    .REQUIRED_ATTR(quant_algo, ListInt)
-    .REQUIRED_ATTR(scale_sqrt, ListInt)
-    .REQUIRED_ATTR(num_output, Int)
-    .ATTR(transpose, Bool, false)
-    .ATTR(bias_term, Bool, false)
-    .ATTR(axis, Int, 1)
-    .OP_END_FACTORY_REG(QuantizedInnerProduct)
 
 /**
 * @brief Dequantizes the input tensor into a float tensor.\n
diff --git a/third_party/fwkacllib/inc/ops/ragged_array_ops.h b/third_party/fwkacllib/inc/ops/ragged_array_ops.h
index 245f3551..4f3cf97e 100644
--- a/third_party/fwkacllib/inc/ops/ragged_array_ops.h
+++ b/third_party/fwkacllib/inc/ops/ragged_array_ops.h
@@ -45,12 +45,10 @@ namespace ge {
 
 REG_OP(RaggedGather)
     .DYNAMIC_INPUT(params_nested_splits, TensorType({DT_INT32, DT_INT64}))
-    .INPUT(params_dense_values, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \
-        DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL}))
+    .INPUT(params_dense_values, TensorType({DT_INT32, DT_INT64}))
     .INPUT(indices, TensorType({DT_INT32, DT_INT64}))
     .DYNAMIC_OUTPUT(output_nested_splits, TensorType({DT_INT32, DT_INT64}))
-    .OUTPUT(output_dense_values, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \
-        DT_UINT16, DT_INT32, DT_UINT32, DT_INT64, DT_UINT64, DT_BOOL}))
+    .OUTPUT(output_dense_values, TensorType({DT_INT32, DT_INT64}))
     .REQUIRED_ATTR(Tsplits, Type)
     .ATTR(PARAMS_RAGGED_RANK, Int, 1)
     .ATTR(OUTPUT_RAGGED_RANK, Int, 0)
diff --git a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h
index 8e07bdc5..7a42e4d9 100644
--- a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h
+++ b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h
@@ -50,5 +50,43 @@ REG_OP(RaggedTensorToSparse)
     .ATTR(RAGGED_RANK, Int, 1)
     .ATTR(Tsplits, Type, DT_INT64)
     .OP_END_FACTORY_REG(RaggedTensorToSparse)
+
+/**
+*@brief Creates a dense tensor from a ragged tensor, possibly altering its shape.
+
+*@par Inputs:
+*Four inputs, including:
+*@li shape: A `Tensor`. Must be one of the following types: `int64`, `int32`.
+*@li values: A 1D tensor representing the values of the ragged tensor.
+*@li default_value: A `Tensor`. Must have the same type as `values`.
+*@li row_partition_tensors: A list of at least 1 `Tensor` objects with the same \n
+type in: `int64`, `int32`.
+
+*@par Attributes:
+*@li num_row_partition_tensors: The number of row partition tensors.
+*@li row_partition_types: A list of `strings`. \n
+The types of the row partition tensors. At present, these can be: \n
+* "ROW_SPLITS": the row_splits tensor from the ragged tensor. \n
+* "VALUE_ROWIDS": the value_rowids tensor from the ragged tensor. \n
+* "FIRST_DIM_SIZE": if value_rowids is used for the first dimension, then it \n
+is preceded by "FIRST_DIM_SIZE".
+
+*@par Outputs:
+*@li result: A `Tensor`. Has the same type as `values`.
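+*
+* A worked example (an illustration, not from the original documentation):
+* with row_partition_types = ["ROW_SPLITS"], a single row partition tensor
+* [0, 3, 5], values [1, 2, 3, 4, 5], shape [2, 3], and default_value 0, the
+* rows are values[0:3] and values[3:5], so "result" is [[1, 2, 3], [4, 5, 0]].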
+*/
+REG_OP(RaggedTensorToTensor)
+    .INPUT(shape, TensorType({DT_INT32, DT_INT64}))
+    .INPUT(values, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
+                               DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16}))
+    .INPUT(default_value, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16,
+                                      DT_UINT16, DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16}))
+    .DYNAMIC_INPUT(row_partition_tensors, TensorType({DT_INT32, DT_INT64}))
+    .OUTPUT(result, TensorType({DT_BOOL, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
+                                DT_INT32, DT_INT64, DT_DOUBLE, DT_FLOAT, DT_FLOAT16}))
+    .REQUIRED_ATTR(num_row_partition_tensors, Int)
+    .REQUIRED_ATTR(row_partition_types, ListString)
+    .OP_END_FACTORY_REG(RaggedTensorToTensor)
+
+
 } // namespace ge
 #endif  // GE_OP_RAGGED_CONVERSION_OPS_H
\ No newline at end of file
diff --git a/third_party/fwkacllib/inc/ops/ragged_math_ops.h b/third_party/fwkacllib/inc/ops/ragged_math_ops.h
index 51797ff8..80669f0f 100644
--- a/third_party/fwkacllib/inc/ops/ragged_math_ops.h
+++ b/third_party/fwkacllib/inc/ops/ragged_math_ops.h
@@ -41,11 +41,11 @@ namespace ge {
 */
 
 REG_OP(RaggedRange)
-    .INPUT(starts, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
-    .INPUT(limits, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
-    .INPUT(deltas, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .INPUT(starts, TensorType({DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .INPUT(limits, TensorType({DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .INPUT(deltas, TensorType({DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
     .OUTPUT(rt_nested_splits, TensorType({DT_INT32, DT_INT64}))
-    .OUTPUT(rt_dense_values, TensorType({DT_FLOAT16,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
+    .OUTPUT(rt_dense_values, TensorType({DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64}))
     .REQUIRED_ATTR(Tsplits, Type)
     .OP_END_FACTORY_REG(RaggedRange)
 
diff --git a/third_party/fwkacllib/inc/ops/rnn.h b/third_party/fwkacllib/inc/ops/rnn.h
index abd98695..7a6aaa9e 100644
--- a/third_party/fwkacllib/inc/ops/rnn.h
+++ b/third_party/fwkacllib/inc/ops/rnn.h
@@ -180,15 +180,15 @@ REG_OP(RNN)
     .OPTIONAL_INPUT(x_static, TensorType({DT_FLOAT16}))
     .OPTIONAL_INPUT(h_0, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(w_xh, TensorType({DT_FLOAT16}))
+    .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(w_sh, TensorType({DT_FLOAT16}))
     .INPUT(w_hh, TensorType({DT_FLOAT16}))
     .INPUT(w_ho, TensorType({DT_FLOAT16}))
-    .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(bias_o, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(o, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(h_t, TensorType({DT_FLOAT16, DT_FLOAT}))
-    .ATTR(expose_hidden, Bool, false)
     .ATTR(num_output, Int, 0)
+    .ATTR(expose_hidden, Bool, false)
     .OP_END_FACTORY_REG(RNN)
 
 /**
@@ -220,9 +220,9 @@ REG_OP(BasicRNNCell)
     .OPTIONAL_INPUT(w_xh_x_static, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OPTIONAL_INPUT(h_0, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(w_xh, TensorType({DT_FLOAT16}))
+    .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT}))
    .OPTIONAL_INPUT(w_hh, TensorType({DT_FLOAT16}))
     .INPUT(w_ho, TensorType({DT_FLOAT16}))
-    .INPUT(bias_h, TensorType({DT_FLOAT16, DT_FLOAT}))
     .INPUT(bias_o, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(o_t, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OUTPUT(h_t, TensorType({DT_FLOAT16, DT_FLOAT}))
diff --git
a/third_party/fwkacllib/inc/ops/sdca_ops.h b/third_party/fwkacllib/inc/ops/sdca_ops.h index 3f1e938a..15428d2b 100644 --- a/third_party/fwkacllib/inc/ops/sdca_ops.h +++ b/third_party/fwkacllib/inc/ops/sdca_ops.h @@ -64,7 +64,7 @@ REG_OP(SdcaOptimizerV2) .INPUT(example_weights, TensorType({DT_FLOAT})) .INPUT(example_labels, TensorType({DT_FLOAT})) .DYNAMIC_INPUT(sparse_indices, TensorType({DT_INT64})) - .DYNAMIC_INPUT(sparse_weights, TensorType({DT_INT64})) + .DYNAMIC_INPUT(sparse_weights, TensorType({DT_FLOAT})) .DYNAMIC_INPUT(dense_weights, TensorType({DT_FLOAT})) .INPUT(example_state_data, TensorType({DT_FLOAT})) .OUTPUT(out_example_state_data, TensorType({DT_FLOAT})) diff --git a/third_party/fwkacllib/inc/ops/selection_ops.h b/third_party/fwkacllib/inc/ops/selection_ops.h index dab71025..c7b59caa 100644 --- a/third_party/fwkacllib/inc/ops/selection_ops.h +++ b/third_party/fwkacllib/inc/ops/selection_ops.h @@ -240,7 +240,7 @@ REG_OP(GatherV2D) REG_OP(StridedSlice) .INPUT(x, TensorType::BasicType()) .INPUT(begin, TensorType::IndexNumberType()) - .INPUT(end, TensorType::IndexNumberTypeT()) + .INPUT(end, TensorType::IndexNumberType()) .INPUT(strides, TensorType::IndexNumberType()) .ATTR(begin_mask, Int, 0) .ATTR(end_mask, Int, 0) @@ -571,7 +571,7 @@ REG_OP(SegmentMax) *@par Outputs: *y:A Tensor with same type as "x". -*/ +*/ REG_OP(SegmentMaxD) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32})) @@ -703,6 +703,7 @@ REG_OP(SliceD) * @attention Constraints: * @li k =< 4096 * @li Size of the last dimension =< 65500 +* @li sorted = true */ REG_OP(TopKD) .INPUT(x, TensorType::RealNumberType()) @@ -1309,174 +1310,6 @@ REG_OP(UnsortedSegmentProdD) .OP_END_FACTORY_REG(UnsortedSegmentProdD) /** -*@brief Normalizes data. It is called Region on YOLO v2 and Yolo on YOLO v3. - -*@par Inputs: -*x: An NCHW tensor of type float16 or float32. The data is with shape (N, boxes*(coords+obj+classes), H, W),where, "obj" indicates the confidence of an object, and only one confidence is supported. Boxes are arranged as xx...xyy...yww...whh...hbb...bc0c0..c0c1c1...c1......cncn...cn. - -*@par Attributes: -*@li boxes: A required int32, specifying the number of anchor boxes. Defaults to "5" for V2 or "3" for V3. -*@li coords: An int32, specifying the number of parameters required for locating an object. The value is fixed at "4", corresponding to (x,y,w,h). -*@li classes: An int32, specifying the number of prediction classes. Defaults to "80". The value range is [1, 1024]. -*@li yolo_version: A string, specifying the YOLO version, either "V2" or "V3". -*@li softmax: A bool, specifying whether to perform softmax, valid only when "yolo_version = V2". -*@li background: A bool, specifying the operation types of the obj and classes, used in conjunction with "softmax" and valid only when "yolo_version = V2". -*@li background: A bool. - -*@par Outputs: -*@li coord_data: A float16 or float32 with shape [N, boxes*coords, ceilx(height*width*2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the coordinates of a detected box. -*@li obj_prob: A float16 or float32 with shape [N, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the confidence. 
-*@li classes_prob: A float16 or float32 with shape [N, classes, ceilx(boxes*height*width *2+32, 32)/2], where "ceil" indicates that a detected box is aligned upwards with the second parameter. Specifies the prediction classes. - -*@attention Constraints: -*@li This operator applies to YOLO v2 and v3 networks. -*@li The succeeding layer of the Yolo operator must be operator Yolov3DetectionOutput. -*/ -REG_OP(Yolo) - .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .ATTR(boxes, Int, 3) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(yolo_version, String, "V3") - .ATTR(softmax, Bool, false) - .ATTR(background, Bool, false) - .ATTR(softmaxtree, Bool, false) - .OP_END_FACTORY_REG(Yolo) - -/** -*@brief Performs YOLO V3 detection. - -*@par Inputs: -*Ten inputs, including: -*@li Operator Yolov3DetectionOutput takes the outputs of operator Yolo as its inputs. A Yolo operator has three outputs: "coords", "obj", and "class". \n -There are three Yolo operators at Yolov3DetectionOutput's preceding layer on Yolo v3. For details, see the description of operator Yolo. -*@li imginfo: A float16, describing the image information including the required image height and width \n -and the actual image height and width. -* -*@par Attributes: -*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" -*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. -*@li coords: Specifies the number of coordinate parameters. Must be 4. -*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. -*@li relative: An optional bool. Defaults to and must be "true". -*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo). The value range is [0.0, 1.0]. - -*@li post_nms_topn: An optional int32. This attribute is reserved. -*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo). The value range is [0.0, 1.0]. - -*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n - -*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". -* -*@par Outputs: -*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. -*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. - -*@attention Constraints:\n -*@li This operator applies only to the YOLO v3 network. -*@li The preceding layer of operator Yolov3DetectionOutput must be three Yolo operators. 
- -*@see Yolo() -*/ -REG_OP(YoloV3DetectionOutput) - .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) - .REQUIRED_ATTR(biases_low, ListFloat) - .REQUIRED_ATTR(biases_mid, ListFloat) - .REQUIRED_ATTR(biases_high, ListFloat) - .ATTR(boxes, Int, 3) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(relative, Bool, true) - .ATTR(obj_threshold, Float, 0.5) - .ATTR(post_nms_topn, Int, 1024) - .ATTR(score_threshold, Float, 0.5) - .ATTR(iou_threshold, Float, 0.45) - .ATTR(pre_nms_topn, Int, 512) - .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(box_out_num, TensorType({DT_INT32})) - .OP_END_FACTORY_REG(YoloV3DetectionOutput) - -/** -*@brief Performs YOLO V3 detection. - -*@par Inputs: -*16 Input, including: -*@li The outputs of operator Yolo at the preceding layer (that is, three Yolo operators on YOLO v3) are used as the inputs of operator Yolov3DetectionOutput. \n -A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. -*@li imginfo: A float16, describing the image information including the required image height and width \n -and the actual image height and width. -*@li windex: A windex tensor with shape [height,weight]. Has the same type as the inputs. [[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed for the three Yolo outputs, respectively. - -*@li hindex: A hindex tensor with shape [height,weight]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]] is formed for the three Yolo outputs, respectively. - -* -*@par Attributes: -*@li biases: A required float32. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" -*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. -*@li coords: Specifies the number of coordinate parameters. Must be 4. -*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. -*@li relative: An optional bool. Defaults to and must be "true". -*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo). The value range is [0.0, 1.0]. -*@li post_nms_topn: An optional int32. This attribute is reserved. -*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo). The value range is [0.0, 1.0]. -*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n -*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". -* -*@par Outputs: -*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. 
-*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. - -*@attention Constraints:\n -*@li This operator applies only to the YOLO v3 network. -*@li The preceding layer of operator Yolov3DetectionOutput must be three Yolo operators. -*@see Yolo() -*/ -REG_OP(YoloV3DetectionOutputD) - .INPUT(coord_data_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(coord_data_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(coord_data_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_low, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_mid, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob_high, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(windex1, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(windex2, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(windex3, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(hindex1, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(hindex2, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(hindex3, TensorType({DT_FLOAT16,DT_FLOAT})) - .REQUIRED_ATTR(biases_low, ListFloat) - .REQUIRED_ATTR(biases_mid, ListFloat) - .REQUIRED_ATTR(biases_high, ListFloat) - .ATTR(boxes, Int, 3) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(relative, Bool, true) - .ATTR(obj_threshold, Float, 0.5) - .ATTR(post_nms_topn, Int, 1024) - .ATTR(score_threshold, Float, 0.5) - .ATTR(iou_threshold, Float, 0.45) - .ATTR(pre_nms_topn, Int, 512) - .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(box_out_num, TensorType({DT_INT32})) - .OP_END_FACTORY_REG(YoloV3DetectionOutputD) - -/** *@brief Performs object detection. *@par Inputs: @@ -1555,116 +1388,6 @@ REG_OP(ProposalD) .OP_END_FACTORY_REG(ProposalD) /** -*@brief Performs YOLO V2 detection. - -*@par Inputs: -* Four inputs, including: -*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov3DetectionOutput. \n -Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. -*@li imginfo: A float16, describing the image information including the required image height and width \n -and the actual image height and width. -* -*@par Attributes: -*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" -*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. -*@li coords: Specifies the number of coordinate parameters. Must be 4. -*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. -*@li relative: An optional bool. Defaults to and must be "true". -*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo). The value range is [0.0, 1.0]. - -*@li post_nms_topn: An optional int32. This attribute is reserved. -*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo). The value range is [0.0, 1.0]. -*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n -*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). 
Defaults to "1024". -* -*@par Outputs: -*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. -*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. - -*@attention Constraints:\n -*@li This operator applies only to the YOLO v2 network. -*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator. - -*@see Yolo() -*/ -REG_OP(YoloV2DetectionOutput) - .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) - .REQUIRED_ATTR(biases, ListFloat) - .ATTR(boxes, Int, 5) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(relative, Bool, true) - .ATTR(obj_threshold, Float, 0.5) - .ATTR(post_nms_topn, Int, 1024) - .ATTR(score_threshold, Float, 0.5) - .ATTR(iou_threshold, Float, 0.45) - .ATTR(pre_nms_topn, Int, 512) - .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(box_out_num, TensorType({DT_INT32})) - .OP_END_FACTORY_REG(YoloV2DetectionOutput) - -/** -*@brief Performs YOLO V2 detection. - -*@par Inputs: -*Six inputs, including: -*@li The outputs of operator Yolo at the preceding layer (that is, one Yolo operator on YOLO v2) are used as the inputs of operator Yolov2DetectionOutput. \n -Each Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo. -*@li imginfo: A float16, describing the image information including the required image height and width \n -and the actual image height and width. -*@li windex: A windex tensor with shape [height, weight]. Has the same type as the inputs. [[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed. \n - -*@li hindex: A hindex tensor with shape [height, weight]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]]. \n - -* -*@par Attributes: -*@li biases: A required float. "biases = Number of Yolo operators at the preceding layer x 2 x boxes" -*@li boxes: A required int32, specifying the number of anchor boxes predicted for each Yolo layer. -*@li coords: Specifies the number of coordinate parameters. Must be 4. -*@li classes: A required int32, specifying the number of classes to be predicted. The value range is [1, 80]. -*@li relative: An optional bool. Defaults to and must be "true". -*@li obj_threshold: A required float, specifying the confidence threshold for box filtering, which is the output "obj" of operator Yolo). The value range is [0.0, 1.0]. -*@li post_nms_topn: An optional int32. This attribute is reserved. -*@li score_threshold: A required float, specifying the class score threshold for box filtering, which is the output "class" of operator Yolo). The value range is [0.0, 1.0]. - -*@li iou_threshold: A required float, specifying the intersection-over-union (IOU) threshold for box filtering. The value range is [0.0, 1.0].\n -*@li pre_nms_topn: An optional int, specifying the number of boxes for non-maximum suppression (NMS). Defaults to "1024". -* -*@par Outputs: -*@li boxout: An NCHW tensor of type float16, describing the information of each output box, including the coordinates, class, and confidence. -*@li boxoutnum: An NCHW tensor of type int32, specifying the number of output boxes. 
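The "windex" and "hindex" inputs described above are plain index grids over the feature map (the comments' "weight" reads as "width"). A minimal host-side sketch of how such constant tensors could be filled; the helper names BuildWindex/BuildHindex are hypothetical and not part of this patch:

```cpp
#include <cstddef>
#include <vector>

// Column-index grid of shape [height, width]:
// every row is 0, 1, ..., width - 1.
std::vector<float> BuildWindex(int height, int width) {
  std::vector<float> grid(static_cast<std::size_t>(height) * width);
  for (int h = 0; h < height; ++h)
    for (int w = 0; w < width; ++w)
      grid[static_cast<std::size_t>(h) * width + w] = static_cast<float>(w);
  return grid;
}

// Row-index grid of shape [height, width]: row h is filled with h.
std::vector<float> BuildHindex(int height, int width) {
  std::vector<float> grid(static_cast<std::size_t>(height) * width);
  for (int h = 0; h < height; ++h)
    for (int w = 0; w < width; ++w)
      grid[static_cast<std::size_t>(h) * width + w] = static_cast<float>(h);
  return grid;
}
```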
-* -*@attention Constraints:\n -*@li This operator applies only to the YOLO v2 network. -*@li The preceding layer of operator Yolov2DetectionOutput must be one Yolo operator. - -*@see Yolo() -*/ -REG_OP(YoloV2DetectionOutputD) - .INPUT(coord_data, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(obj_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(classes_prob, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(img_info, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(windex, TensorType({DT_FLOAT16,DT_FLOAT})) - .INPUT(hindex, TensorType({DT_FLOAT16,DT_FLOAT})) - .REQUIRED_ATTR(biases, ListFloat) - .ATTR(boxes, Int, 5) - .ATTR(coords, Int, 4) - .ATTR(classes, Int, 80) - .ATTR(relative, Bool, true) - .ATTR(obj_threshold, Float, 0.5) - .ATTR(post_nms_topn, Int, 1024) - .ATTR(score_threshold, Float, 0.5) - .ATTR(iou_threshold, Float, 0.45) - .ATTR(pre_nms_topn, Int, 512) - .OUTPUT(box_out, TensorType({DT_FLOAT16,DT_FLOAT})) - .OUTPUT(box_out_num, TensorType({DT_INT32})) - .OP_END_FACTORY_REG(YoloV2DetectionOutputD) - -/** *@brief Performs plane or channel conversion on YoloV2. * If reverse=true: (N, H, W, C)->(N, H*stride, W*stride, C/(stride*stride)) * If reverse=false: (N, H, W, C)->(N, H/stride, W/stride, C*(stride*stride)) diff --git a/third_party/fwkacllib/inc/ops/sparse_ops.h b/third_party/fwkacllib/inc/ops/sparse_ops.h index 87f44a54..5c50298c 100644 --- a/third_party/fwkacllib/inc/ops/sparse_ops.h +++ b/third_party/fwkacllib/inc/ops/sparse_ops.h @@ -215,7 +215,7 @@ REG_OP(SparseDenseCwiseMul) REG_OP(AddSparseToTensorsMap) .INPUT(indices, TensorType({DT_INT64})) .INPUT(values, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, \ - DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE \ + DT_UINT16, DT_UINT8, DT_INT32, DT_INT64, DT_BOOL, DT_DOUBLE, \ DT_COMPLEX64, DT_COMPLEX128, DT_RESOURCE, DT_STRING})) .INPUT(shape, TensorType({DT_INT64})) .OUTPUT(handle, TensorType({DT_INT64})) @@ -410,8 +410,6 @@ REG_OP(SparseToDense) * @li y_indices:A `Tensor` of type `int64`. * @li y_values:A `Tensor`. Has the same type as `values`. * @li y_shape:A `Tensor` of type `int64`. - -* Compatible SparseConcat operator in Tensorflow */ REG_OP(SparseConcat) .DYNAMIC_INPUT(indices, TensorType({DT_INT64})) @@ -452,8 +450,6 @@ REG_OP(SparseConcat) * @li sum_indices:A `Tensor` of type `int64`. * @li sum_values:A `Tensor`. Has the same type as `x1_values`. * @li sum_shape:A `Tensor` of type `int64`. - -* Compatible SparseAdd operator in Tensorflow */ REG_OP(SparseAdd) .INPUT(x1_indices, TensorType({DT_INT64})) @@ -489,8 +485,6 @@ REG_OP(SparseAdd) * @li y_values:A `Tensor`. Has the same type as `values`. * @li empty_row_indicator:A `Tensor` of type `bool`. * @li reverse_index_map:A `Tensor` of type `int64`. - -* Compatible SparseFillEmptyRows operator in Tensorflow */ REG_OP(SparseFillEmptyRows) .INPUT(indices, TensorType({DT_INT64})) @@ -529,8 +523,6 @@ REG_OP(SparseFillEmptyRows) *@par Outputs: * @li y_indices:A `Tensor` of type `int64`. * @li y_values:A `Tensor`. Has the same type as `x1_values`. - -* Compatible SparseSparseMaximum operator in Tensorflow */ REG_OP(SparseSparseMaximum) .INPUT(x1_indices, TensorType({DT_INT64})) @@ -564,8 +556,6 @@ REG_OP(SparseSparseMaximum) *@par Outputs: * @li y_indices:A `Tensor` of type `int64`. * @li y_values:A `Tensor`. Has the same type as `x1_values`. - -* Compatible SparseSparseMinimum operator in Tensorflow */ REG_OP(SparseSparseMinimum) .INPUT(x1_indices, TensorType({DT_INT64})) @@ -604,8 +594,6 @@ REG_OP(SparseSparseMinimum) *@par Outputs: * y:A `Tensor`. 
Has the same type as `input_values`. - -* Compatible SparseReduceMax operator in Tensorflow */ REG_OP(SparseReduceMax) .INPUT(x_indices, TensorType({DT_INT64})) @@ -640,8 +628,6 @@ REG_OP(SparseReduceMax) * @li y_indices:A `Tensor` of type `int64`. * @li y_values:A `Tensor`. Has the same type as `input_values`. * @li y_shape:A `Tensor` of type `int64`. - -* Compatible SparseReduceMaxSparse operator in Tensorflow */ REG_OP(SparseReduceMaxSparse) .INPUT(x_indices, TensorType({DT_INT64})) @@ -854,7 +840,7 @@ REG_OP(AddManySparseToTensorsMap) * The "N" serialized SparseTensor objects. *@par Attributes: -* @li dtype: A tf.DType. The "dtype" of the SparseTensor objects stored in the "SparseTensorsMap". +* @li dtype: A DType. The "dtype" of the SparseTensor objects stored in the "SparseTensorsMap". * @li container: An optional string. Defaults to "". \n *The container name for the "SparseTensorsMap" read by this op. * @li shared_name: An optional string. Defaults to "". \n diff --git a/third_party/fwkacllib/inc/ops/stateful_random_ops.h b/third_party/fwkacllib/inc/ops/stateful_random_ops.h index 929481d5..9ba09dd6 100644 --- a/third_party/fwkacllib/inc/ops/stateful_random_ops.h +++ b/third_party/fwkacllib/inc/ops/stateful_random_ops.h @@ -87,9 +87,9 @@ smaller than the range of the output (either `2^32` or `2^64`). REG_OP(StatefulRandomBinomial) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT32, DT_INT64})) - .INPUT(counts, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64})) - .INPUT(probs, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64})) + .INPUT(shape, TensorType({DT_INT32})) + .INPUT(counts, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) + .INPUT(probs, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64})) .REQUIRED_ATTR(dtype, Type) .OP_END_FACTORY_REG(StatefulRandomBinomial) @@ -111,7 +111,7 @@ REG_OP(StatefulRandomBinomial) REG_OP(StatefulStandardNormalV2) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) .OUTPUT(y, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(StatefulStandardNormalV2) @@ -134,7 +134,7 @@ REG_OP(StatefulStandardNormalV2) REG_OP(StatefulTruncatedNormal) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) .OUTPUT(y, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(StatefulTruncatedNormal) @@ -156,7 +156,7 @@ lower bound 0 is included in the range, while the upper bound 1 is excluded. \n REG_OP(StatefulUniform) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) .OUTPUT(y, TensorType({DT_FLOAT})) .OP_END_FACTORY_REG(StatefulUniform) @@ -177,8 +177,8 @@ The generated values are uniform integers covering the whole range of `dtype`. REG_OP(StatefulUniformFullInt) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) - .OUTPUT(y, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) + .OUTPUT(y, TensorType({DT_UINT64})) .OP_END_FACTORY_REG(StatefulUniformFullInt) /** @@ -205,7 +205,7 @@ smaller than the range of the output (either `2^32` or `2^64`). 
REG_OP(StatefulUniformInt) .INPUT(x, TensorType({DT_RESOURCE})) .INPUT(algorithm, TensorType({DT_INT64})) - .INPUT(shape, TensorType({DT_INT64})) + .INPUT(shape, TensorType({DT_INT32,DT_INT64})) .INPUT(minval, TensorType({DT_INT64})) .INPUT(maxval, TensorType({DT_INT64})) .OUTPUT(y, TensorType({DT_INT64})) diff --git a/third_party/fwkacllib/inc/ops/string_ops.h b/third_party/fwkacllib/inc/ops/string_ops.h index 9b87817f..1b88fbd0 100644 --- a/third_party/fwkacllib/inc/ops/string_ops.h +++ b/third_party/fwkacllib/inc/ops/string_ops.h @@ -127,7 +127,7 @@ include: \n *inputs are trusted or unimportant. There is a risk of adversaries\n *constructing inputs that all hash to the same bucket.\n *To prevent this problem, use a strong hash function with\n -*tf.string_to_hash_bucket_strong. +*string_to_hash_bucket_strong. *@see Substr() @@ -155,7 +155,7 @@ include: \n *This function may be used when CPU time is scarce and inputs are trusted or\n *unimportant. There is a risk of adversaries constructing inputs that all hash\n *to the same bucket. To prevent this problem, use a strong hash function with\n -*tf.string_to_hash_bucket_strong. +*string_to_hash_bucket_strong. *@see StringToHashBucketFast() @@ -187,7 +187,7 @@ include: \n * hash value distribution over buckets. This requires that the hash function\ *is seeded by a high-entropy (random) "key" unknown to the adversary. *@li The additional robustness comes at a cost of roughly 4x higher\n -*compute time than tf.string_to_hash_bucket_fast. +*compute time than string_to_hash_bucket_fast. *@see StringToHashBucketStrong() diff --git a/third_party/fwkacllib/inc/ops/transformation_ops.h b/third_party/fwkacllib/inc/ops/transformation_ops.h index 689cde4e..69dd450f 100644 --- a/third_party/fwkacllib/inc/ops/transformation_ops.h +++ b/third_party/fwkacllib/inc/ops/transformation_ops.h @@ -400,8 +400,8 @@ REG_OP(Unpack) * "ksizes", "strides" and "rates" are lists of integers. */ REG_OP(ExtractImagePatches) - .INPUT(x, TensorType::REALNUMBERTYPE()) - .OUTPUT(y, TensorType::REALNUMBERTYPE()) + .INPUT(x, TensorType::RealNumberType()) + .OUTPUT(y, TensorType::RealNumberType()) .REQUIRED_ATTR(ksizes, ListInt) .REQUIRED_ATTR(strides, ListInt) .REQUIRED_ATTR(rates, ListInt) @@ -409,6 +409,37 @@ REG_OP(ExtractImagePatches) .OP_END_FACTORY_REG(ExtractImagePatches) /** +* @brief Extract "patches" from "input" and put them in the "depth" +* dimension of the output. + +* @par Inputs: +* x: A 5D Tensor with shape [batch, in_planes, in_rows, in_cols, depth]. + +* @par Attributes: +* @li ksizes: A required list or tuple. The size of the sliding window for each +* dimension of "x". +* @li strides: A required list or tuple. How far the centers of two consecutive +* patches are in "x". Must be: [1, stride_planes, stride_rows, stride_cols, 1]. +* @li padding: A required string. The type of padding algorithm to use. + +* @par Outputs: +* Output: A 5D Tensor with shape [batch, out_planes, out_rows, out_cols, ksize_planes * \n +* ksize_rows * ksize_cols * depth] containing patches with size (ksize_rows * ksize_cols\n +* * depth) vectorized in the "depth" dimension. Note "out_planes", "out_rows" and "out_cols"\n +* are the dimensions of the output patches. + +* @attention Constraints: +* "ksizes" and "strides" are lists of integers. 
+*/
+REG_OP(ExtractVolumePatches)
+    .INPUT(x, TensorType::REALNUMBERTYPE())
+    .OUTPUT(y, TensorType::REALNUMBERTYPE())
+    .REQUIRED_ATTR(ksizes, ListInt)
+    .REQUIRED_ATTR(strides, ListInt)
+    .REQUIRED_ATTR(padding, String)
+    .OP_END_FACTORY_REG(ExtractVolumePatches)
+
+/**
 *@brief Confuse reshape and transpose.

 *@par Inputs:
@@ -466,7 +497,7 @@ REG_OP(ConfusionTranspose)
 *y: The flattened ND tensor. All data types are supported.

 *@attention Constraints:
-* "axis" and "end_axis" must be within the dimension range of the input.
+* "axis" and "end_axis" must be within the dimension range of the input. This operator cannot be directly called by the aclopExecute API.
 */
 REG_OP(FlattenV2)
   .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_INT16, DT_UINT16,
diff --git a/third_party/fwkacllib/inc/register/op_kernel_registry.h b/third_party/fwkacllib/inc/register/op_kernel_registry.h
index 47bdca07..2c479e92 100644
--- a/third_party/fwkacllib/inc/register/op_kernel_registry.h
+++ b/third_party/fwkacllib/inc/register/op_kernel_registry.h
@@ -18,7 +18,8 @@
 #define INC_REGISTER_OP_KERNEL_REGISTRY_H_
 #include <memory>
 #include <string>
-#include "register/register.h"
+#include "register/register_types.h"
+#include "register.h"

 namespace ge {
 class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpKernelRegistry {
@@ -40,7 +41,6 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY OpKernelRegistry {
 private:
  OpKernelRegistry();
  class OpKernelRegistryImpl;
-  /*lint -e148*/
  std::unique_ptr<OpKernelRegistryImpl> impl_;
 };
}  // namespace ge
diff --git a/third_party/fwkacllib/inc/register/register.h b/third_party/fwkacllib/inc/register/register.h
new file mode 100644
index 00000000..27da0b0b
--- /dev/null
+++ b/third_party/fwkacllib/inc/register/register.h
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef INC_REGISTER_REGISTRY_H_
+#define INC_REGISTER_REGISTRY_H_
+
+#include "external/register/register.h"
+
+namespace ge {
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOp {
+ public:
+  HostCpuOp() = default;
+  virtual ~HostCpuOp() = default;
+
+  virtual graphStatus Compute(Operator &op,
+                              const std::map<std::string, const Tensor> &inputs,
+                              std::map<std::string, Tensor> &outputs) = 0;
+};
+
+class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY HostCpuOpRegistrar {
+ public:
+  HostCpuOpRegistrar(const char *op_type, HostCpuOp *(*create_fn)());
+  ~HostCpuOpRegistrar() = default;
+};
+
+#define REGISTER_HOST_CPU_OP_BUILDER(name, op) \
+    REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(__COUNTER__, name, op)
+
+#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ_HELPER(ctr, name, op) \
+    REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op)
+
+#define REGISTER_HOST_CPU_OP_BUILDER_UNIQ(ctr, name, op)        \
+  static ::ge::HostCpuOpRegistrar register_host_cpu_op##ctr     \
+      __attribute__((unused)) =                                 \
+          ::ge::HostCpuOpRegistrar(name, []()->::ge::HostCpuOp* { \
+            return new (std::nothrow) op();                     \
+          })
+}  // namespace ge
+
+#endif  // INC_REGISTER_REGISTRY_H_
diff --git a/third_party/fwkacllib/inc/runtime/kernel.h b/third_party/fwkacllib/inc/runtime/kernel.h
index 1609519f..c99eb96f 100644
--- a/third_party/fwkacllib/inc/runtime/kernel.h
+++ b/third_party/fwkacllib/inc/runtime/kernel.h
@@ -448,7 +448,7 @@ RTS_API rtError_t rtSubscribeReport(uint64_t threadId, rtStream_t stream);
 * @param [in] stream subscribed stream
 * @return RT_ERROR_NONE for ok, errno for failed
 */
-RTS_API rtError_t rtCallbackLaunch(rtCallback_t callBackFunc, void *fnData, rtStream_t stream);
+RTS_API rtError_t rtCallbackLaunch(rtCallback_t callBackFunc, void *fnData, rtStream_t stream, bool isBlock);

 /**
 * @ingroup rt_kernel
diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h
index b55530a1..93b7585a 100644
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -17,9 +17,7 @@
 #ifndef __CCE_RUNTIME_MEM_H__
 #define __CCE_RUNTIME_MEM_H__

-/*lint -e7*/
 #include <stddef.h>
-/*lint +e7*/
 #include "base.h"
 #include "config.h"
 #include "stream.h"
@@ -77,6 +75,8 @@ typedef enum tagRtMemcpyKind {
   RT_MEMCPY_DEVICE_TO_HOST,    // device to host
   RT_MEMCPY_DEVICE_TO_DEVICE,  // device to device, 1P && P2P
   RT_MEMCPY_MANAGED,           // managed memory
+  RT_MEMCPY_ADDR_DEVICE_TO_DEVICE,
+  RT_MEMCPY_HOST_TO_DEVICE_EX,  // host to device ex (only used for 8 bytes)
   RT_MEMCPY_RESERVED,
 } rtMemcpyKind_t;
diff --git a/third_party/fwkacllib/inc/runtime/rt_model.h b/third_party/fwkacllib/inc/runtime/rt_model.h
index 1e03e853..d4e5682b 100644
--- a/third_party/fwkacllib/inc/runtime/rt_model.h
+++ b/third_party/fwkacllib/inc/runtime/rt_model.h
@@ -45,7 +45,8 @@ typedef enum tagModelTaskType {
   RT_MODEL_TASK_EVENT_RESET = 18,
   RT_MODEL_TASK_MODEL_END_GRAPH,
   RT_MODEL_TASK_STREAM_SWITCH_N,
-  RT_MODEL_TASK_RDMA_DB_SEND
+  RT_MODEL_TASK_RDMA_DB_SEND,
+  RT_MODEL_TASK_MEMCPY_ADDR_ASYNC
 } rtModelTaskType_t;

 typedef enum tagModelStreamType {
diff --git a/third_party/fwkacllib/inc/toolchain/slog.h b/third_party/fwkacllib/inc/toolchain/slog.h
index 1fb9aff2..2728c812 100644
--- a/third_party/fwkacllib/inc/toolchain/slog.h
+++ b/third_party/fwkacllib/inc/toolchain/slog.h
@@ -168,6 +168,7 @@ enum {
   DSS,
   PROCMGR,  // Process Manager, Base Platform
   BBOX,
+  AIVECTOR,
   INVLID_MOUDLE_ID
 };

@@ -241,6 +242,7 @@ static DCODE g_moduleIdName[] = {SET_MOUDLE_ID_MAP_NAME(SLOG),
                                  SET_MOUDLE_ID_MAP_NAME(DSS),
                                  SET_MOUDLE_ID_MAP_NAME(PROCMGR),
                                  SET_MOUDLE_ID_MAP_NAME(BBOX),
+
SET_MOUDLE_ID_MAP_NAME(AIVECTOR), { NULL, -1 }}; #endif // MODULE_ID_NAME diff --git a/third_party/fwkacllib/version.info b/third_party/fwkacllib/version.info index 0e65dd04..8bc7f6e0 100644 --- a/third_party/fwkacllib/version.info +++ b/third_party/fwkacllib/version.info @@ -1 +1 @@ -Version=1.60.T49.0.B201 +Version=1.71.T6.0.B070
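For the HostCpuOp plug-in interface added in third_party/fwkacllib/inc/register/register.h above, a minimal usage sketch. The kernel class, the op type string "PassThrough", and the tensor names "x"/"y" are hypothetical; only HostCpuOp, Compute, and REGISTER_HOST_CPU_OP_BUILDER come from the header:

```cpp
#include <map>
#include <string>

#include "register/register.h"  // declares HostCpuOp and the builder macro

namespace {
// A trivial pass-through kernel: copies input "x" to output "y".
class PassThroughKernel : public ge::HostCpuOp {
 public:
  ge::graphStatus Compute(ge::Operator &op,
                          const std::map<std::string, const ge::Tensor> &inputs,
                          std::map<std::string, ge::Tensor> &outputs) override {
    (void)op;  // the Operator handle is unused in this sketch
    auto iter = inputs.find("x");
    if (iter == inputs.end()) {
      return ge::GRAPH_FAILED;
    }
    outputs["y"] = iter->second;
    return ge::GRAPH_SUCCESS;
  }
};
}  // namespace

// Expands to a file-local ge::HostCpuOpRegistrar whose factory lambda
// returns "new (std::nothrow) PassThroughKernel()".
REGISTER_HOST_CPU_OP_BUILDER("PassThrough", PassThroughKernel);
```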