diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ae9c88f..457fa086 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,7 @@ elseif(DEFINED ENV{D_LINK_PATH})
     find_library(slog libslog.so ${GE_LIB_PATH})
     find_library(mmpa libmmpa.so ${GE_LIB_PATH})
     find_library(runtime libruntime.so ${GE_LIB_PATH})
-    find_library(msprof libmsprof.so ${GE_LIB_PATH})
+    find_library(msprof libmsprofiler.a ${GE_LIB_PATH})
     find_library(register libregister.so ${GE_LIB_PATH})
     find_library(hccl libhccl.so ${GE_LIB_PATH})
     find_library(resource libresource.so ${GE_LIB_PATH})
@@ -85,7 +85,7 @@ else()
     set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
     find_library(slog libslog.so ${ASCEND_DRIVER_DIR})
     find_library(mmpa libmmpa.so ${ASCEND_DRIVER_DIR})
-    find_library(msprof libmsprof.so ${ASCEND_DRIVER_DIR})
+    find_library(msprof libmsprofiler.a ${ASCEND_RUNTIME_DIR})
 
     find_library(hccl libhccl.so ${ASCEND_RUNTIME_DIR})
     find_library(runtime libruntime.so ${ASCEND_RUNTIME_DIR})
diff --git a/inc/framework/common/string_util.h b/inc/framework/common/string_util.h
index 918a3950..3e4bf093 100644
--- a/inc/framework/common/string_util.h
+++ b/inc/framework/common/string_util.h
@@ -61,8 +61,10 @@ class StringUtils {
   ///  @param [in] delim  separator
   ///  @return string array after segmentation
   ///
+  /*lint -e1077*/
   static std::vector<std::string> Split(const std::string &str, char delim) {
     std::vector<std::string> elems;
+    /*lint +e1077*/
 
     if (str.empty()) {
       elems.emplace_back("");
diff --git a/inc/framework/omg/omg_inner_types.h b/inc/framework/omg/omg_inner_types.h
index 2f91d7aa..e1a7da0b 100644
--- a/inc/framework/omg/omg_inner_types.h
+++ b/inc/framework/omg/omg_inner_types.h
@@ -92,6 +92,9 @@ struct OmgContext {
   std::map<std::string, std::vector<int32_t>> out_nodes_map;
   // user-designate out nodes (this is used for determing the orders)
   std::vector<std::pair<std::string, int32_t>> user_out_nodes;
+  // save the output node of the network, value = topName,
+  // topName indicates the output name of the operator.
+  std::vector<std::string> user_out_nodes_top_vec;
   // net out nodes (where user_out_nodes or leaf nodes)
   std::vector<std::string> net_out_nodes;
   // net out nodes top names(only caffe has top)
diff --git a/inc/graph/debug/ge_attr_define.h b/inc/graph/debug/ge_attr_define.h
index 7538ba6a..47b11ba8 100644
--- a/inc/graph/debug/ge_attr_define.h
+++ b/inc/graph/debug/ge_attr_define.h
@@ -1052,6 +1052,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_OP_DEBUG_FLAG;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_OP_DEBUG_MODE;
 
+// op dynamic input
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DYNAMIC_INPUT_START;
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_DYNAMIC_INPUT_END;
+
 // functional ops attr
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IF_THEN_BRANCH;
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IF_ELSE_BRANCH;
diff --git a/inc/graph/op_desc.h b/inc/graph/op_desc.h
index c7da30b7..4d724c42 100644
--- a/inc/graph/op_desc.h
+++ b/inc/graph/op_desc.h
@@ -235,7 +235,8 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder {
   vector<string> GetOpInferDepends() const;
 
   string GetInputNameByIndex(uint32_t index) const;
-
+  string GetValidInputNameByIndex(uint32_t index) const;
+  int GetValidInputIndexByName(const string &name) const;
   int GetInputIndexByName(const string &name) const;
 
   string GetOutputNameByIndex(uint32_t index) const;
diff --git a/inc/graph/range_vistor.h b/inc/graph/range_vistor.h
index 20905bd9..8635d413 100644
--- a/inc/graph/range_vistor.h
+++ b/inc/graph/range_vistor.h
@@ -22,8 +22,10 @@
 template <class E, class O>
 class RangeVistor {
  public:
+  /*lint -e151*/
   using Iterator = typename std::vector<E>::iterator;
   using ConstIterator = typename std::vector<E>::const_iterator;
+  /*lint +e151*/
 
   RangeVistor(O owner, const std::vector<E> &vs) : owner_(owner), elements_(vs) {}
 
@@ -41,7 +43,9 @@ class RangeVistor {
 
   bool empty() const { return elements_.empty(); }
 
+  /*lint -e659*/
   E &at(std::size_t index) { return elements_.at(index); }
+  /*lint +e659*/
 
   const E &at(std::size_t index) const { return elements_.at(index); }
 
diff --git a/inc/graph/utils/op_desc_utils.h b/inc/graph/utils/op_desc_utils.h
index 6a9a4695..daa95ebe 100644
--- a/inc/graph/utils/op_desc_utils.h
+++ b/inc/graph/utils/op_desc_utils.h
@@ -53,6 +53,7 @@ class OpDescUtils {
   static vector<GeTensorPtr> MutableWeights(const ge::NodePtr node);
   static graphStatus SetWeights(ge::Node& node, const vector<ge::GeTensorPtr>& weights);
   static graphStatus SetWeights(ge::NodePtr node, const vector<ge::GeTensorPtr>& weights);
+  static graphStatus SetWeights(ge::Node& node, const map<int, ge::GeTensorPtr>& weights_map);
   static graphStatus ClearWeights(ge::NodePtr node);
 
   static bool ClearInputDesc(ge::OpDescPtr op_desc, uint32_t index);
diff --git a/src/common/graph/detail/attributes_holder.cc b/src/common/graph/detail/attributes_holder.cc
index 113f4b6f..7e3b6de9 100644
--- a/src/common/graph/detail/attributes_holder.cc
+++ b/src/common/graph/detail/attributes_holder.cc
@@ -28,7 +28,7 @@ using std::unordered_set;
 void AttrHolder::CopyAttrsFrom(const AttrHolder &holder) { MutableAttrMap().CopyValueFrom(holder.GetAttrMap()); }
 graphStatus AttrHolder::SetAttr(const std::string &name, const GeAttrValue &value) {
   if (value.IsEmpty()) {
-    GELOGE(GRAPH_FAILED, "value is empty, key %s", name.c_str());
+    GELOGE(GRAPH_FAILED, "value is empty, key of the attr is %s", name.c_str());
     return GRAPH_FAILED;
   }
   auto proto_map = MutableAttrMap().GetProtoMsg();
diff --git a/src/common/graph/ge_attr_define.cc b/src/common/graph/ge_attr_define.cc
index cd504812..9b723bb3 100644
--- a/src/common/graph/ge_attr_define.cc
+++ b/src/common/graph/ge_attr_define.cc
@@ -1060,6 +1060,10 @@ const std::string ATTR_NAME_HCCL_FUSED_FLAG = "_hccl_fused_node";
 const std::string ATTR_DYNAMIC_SHAPE_FIXED_ADDR = "_alloc_fixed_addr";
 const std::string ATTR_DYNAMIC_SHAPE_FIXED_ADDR_INDEX = "_alloc_fixed_addr_index";
 
+// op dynamic input
+const std::string ATTR_NAME_DYNAMIC_INPUT_START = "_dynamic_input_index_start";
+const std::string ATTR_NAME_DYNAMIC_INPUT_END = "_dynamic_input_index_end";
+
 // atc user def dtype&format
 const std::string ATTR_ATC_USER_DEFINE_DATATYPE = "_user_defined_data_type";
 const std::string ATTR_ATC_USER_DEFINE_FORMAT = "_user_defined_format";
diff --git a/src/common/graph/node.cc b/src/common/graph/node.cc
index 10d6b3ed..d33c6008 100644
--- a/src/common/graph/node.cc
+++ b/src/common/graph/node.cc
@@ -762,9 +762,10 @@ graphStatus Node::Verify() const {
   if (!is_unknown_graph) {
     for (const auto &in_anchor_ptr : GetAllInDataAnchors()) {
       GE_IF_BOOL_EXEC(in_anchor_ptr == nullptr, GELOGW("in anchor ptr is null"); continue);
-      bool valid_anchor = op_->GetType() == data_type || op_->GetType() == aipp_data_type ||
-                          op_->GetType() == const_type || op_->GetType() == variable_type ||
-                          op_->IsOptionalInput(in_anchor_ptr->GetIdx()) || in_anchor_ptr->GetPeerAnchors().size() > 0;
+      bool valid_anchor =
+        op_->GetType() == data_type || op_->GetType() == aipp_data_type || op_->GetType() == const_type ||
+        op_->GetType() == variable_type || op_->IsOptionalInput(in_anchor_ptr->GetIdx()) ||
+        op_->MutableInputDesc(in_anchor_ptr->GetIdx()) == nullptr || in_anchor_ptr->GetPeerAnchors().size() > 0;
       if (!valid_anchor) {
         ErrorManager::GetInstance().ATCReportErrMessage("E11019", {"opname", "index"},
                                                         {GetName(), std::to_string(in_anchor_ptr->GetIdx())});
diff --git a/src/common/graph/op_desc.cc b/src/common/graph/op_desc.cc
index fdd1acb7..dee0aece 100644
--- a/src/common/graph/op_desc.cc
+++ b/src/common/graph/op_desc.cc
@@ -347,7 +347,10 @@ graphStatus OpDesc::AddOptionalInputDesc(const string &name, const ge::GeTensorD
 
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
 OpDesc::UpdateInputDesc(uint32_t index, const ge::GeTensorDesc &tensor_Desc) {
-  GE_CHK_BOOL_RET_STATUS((index < inputs_desc_.size()), GRAPH_FAILED, "The index is invalid. index[%u]", index);
+  if (index >= inputs_desc_.size()) {
+    GELOGW("The index is invalid. index[%u]", index);
+    return GRAPH_FAILED;
+  }
 
   inputs_desc_[index] = ComGraphMakeShared<GeTensorDesc>(tensor_Desc);
   if (inputs_desc_[index] == nullptr) {
@@ -949,6 +952,43 @@ int OpDesc::GetInputIndexByName(const string &name) const {
   return static_cast<int>(it_find->second);
 }
 
+int OpDesc::GetValidInputIndexByName(const string &name) const {
+  // Position of `name` among the inputs whose desc is non-null; -1 if absent or if a valid input has no name.
+  int found_idx = -1;
+  uint32_t valid_count = 0;
+  for (size_t pos = 0; pos < GetAllInputsSize(); ++pos) {
+    if (MutableInputDesc(static_cast<uint32_t>(pos)) == nullptr) {
+      continue;  // inputs without a desc do not consume a valid index
+    }
+    const auto cur_name = GetInputNameByIndex(static_cast<uint32_t>(pos));
+    GE_CHK_BOOL_RET_STATUS_NOLOG(!cur_name.empty(), -1);
+    found_idx = (found_idx < 0 && cur_name == name) ? static_cast<int>(valid_count) : found_idx;
+    ++valid_count;
+  }
+  return found_idx;
+}
+
+string OpDesc::GetValidInputNameByIndex(uint32_t index) const {
+  // Name of the index-th valid input, where "valid" means the input's desc is non-null
+  // and only valid inputs are counted.
+  // Yields "" when there is no such input or when any valid input has an empty name.
+  string matched_name;
+  uint32_t valid_count = 0;
+  for (size_t pos = 0; pos < GetAllInputsSize(); ++pos) {
+    if (MutableInputDesc(static_cast<uint32_t>(pos)) == nullptr) {
+      continue;  // inputs without a desc do not consume a valid index
+    }
+    const auto cur_name = GetInputNameByIndex(static_cast<uint32_t>(pos));
+    GE_CHK_BOOL_RET_STATUS_NOLOG(!cur_name.empty(), "");
+    if (valid_count == index) {
+      // Keep scanning after the hit: an unnamed valid input later on must still yield "".
+      matched_name = cur_name;
+    }
+    ++valid_count;
+  }
+  return matched_name;
+}
+
 GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY string OpDesc::GetOutputNameByIndex(uint32_t index) const {
   auto it = output_name_idx_.begin();
   for (; it != output_name_idx_.end(); ++it) {
diff --git a/src/common/graph/ref_relation.cc b/src/common/graph/ref_relation.cc
index 9a9f66ba..48e136fb 100644
--- a/src/common/graph/ref_relation.cc
+++ b/src/common/graph/ref_relation.cc
@@ -56,7 +56,7 @@ class RefRelations::Impl {
       }
       return GRAPH_SUCCESS;
     }
-    GELOGW("can not find any relations! key value is %s", lookup_key.c_str());
+    GELOGW("can not find any relations! key value of dest relation is %s", lookup_key.c_str());
     return GRAPH_SUCCESS;
   };
   graphStatus BuildRefRelations(ge::ComputeGraph &root_graph);
diff --git a/src/common/graph/utils/op_desc_utils.cc b/src/common/graph/utils/op_desc_utils.cc
index 63fff177..17c80b2c 100644
--- a/src/common/graph/utils/op_desc_utils.cc
+++ b/src/common/graph/utils/op_desc_utils.cc
@@ -560,6 +560,53 @@ OpDescUtils::SetWeights(ge::Node &node, const vector<ge::GeTensorPtr> &weights)
   return GRAPH_SUCCESS;
 }
 
+// Sets weights by input index: refreshes the Const node already feeding an input, or creates and links a new
+// Const node when the input has no producer. Fails if an existing producer is not a Const node.
+GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
+OpDescUtils::SetWeights(ge::Node &node, const map<int, ge::GeTensorPtr> &weights_map) {
+  GE_CHECK_NOTNULL(node.GetOpDesc());
+  if (node.GetOpDesc()->GetType() == CONSTANT) {  // 1. node is const
+    if (weights_map.size() == CONST_OP_NORMAL_WEIGHT_SIZE) {
+      return SetWeights(node.GetOpDesc(), weights_map.begin()->second);
+    }
+    GELOGE(GRAPH_PARAM_INVALID, "const op %s weight size %zu should be 1", node.GetName().c_str(), weights_map.size());
+    return GRAPH_PARAM_INVALID;
+  }
+  for (const auto &pair : weights_map) {  // 2. node is not const
+    auto in_data_anchor = node.GetInDataAnchor(pair.first);
+    if (in_data_anchor != nullptr && in_data_anchor->GetPeerOutAnchor() != nullptr) {  // a. update const input node
+      auto peer_node = in_data_anchor->GetPeerOutAnchor()->GetOwnerNode();
+      if (peer_node == nullptr) {
+        GELOGE(GRAPH_PARAM_INVALID, "op %s [%d]'s input node is null", node.GetName().c_str(), pair.first);
+        return GRAPH_PARAM_INVALID;
+      }
+      if (peer_node->GetType() != CONSTANT) {
+        GELOGE(GRAPH_PARAM_INVALID, " op %s [%d]'s input node should be const, but is %s type:%s ",
+               node.GetName().c_str(), pair.first, peer_node->GetName().c_str(), peer_node->GetType().c_str());
+        return GRAPH_PARAM_INVALID;  // do not overwrite the weights of a non-const producer
+      }
+      GE_CHK_BOOL_EXEC(SetWeights(peer_node->GetOpDesc(), pair.second) == GRAPH_SUCCESS, return GRAPH_FAILED,
+                       "op %s set weight of const input[%d] failed", node.GetName().c_str(), pair.first);
+    } else {  // b. create new const input node
+      auto const_opdesc = CreateConstOp(pair.second);
+      GE_CHECK_NOTNULL(const_opdesc);
+      auto owner_graph = node.GetOwnerComputeGraph();
+      if (owner_graph == nullptr) {
+        GELOGE(GRAPH_PARAM_INVALID, "node's graph is empty, name: %s", node.GetName().c_str());
+        return GRAPH_PARAM_INVALID;
+      }
+      auto const_node = owner_graph->AddNodeFront(const_opdesc);
+      GE_CHECK_NOTNULL(const_node);
+      if (node.AddLinkFrom(static_cast<uint32_t>(pair.first), const_node) != GRAPH_SUCCESS) {
+        GELOGE(GRAPH_FAILED, "op %s add const to input index[%d] failed", node.GetName().c_str(), pair.first);
+        return GRAPH_FAILED;
+      }
+    }
+  }
+  NodeUtils::UpdateIsInputConst(node);
+  return GRAPH_SUCCESS;
+}
+
 OpDescPtr OpDescUtils::CreateConstOp(const GeTensorPtr &tensor_ptr) {
   GE_CHK_BOOL_EXEC(tensor_ptr != nullptr, return nullptr, "tensor_ptr is nullptr!");
   shared_ptr<OpDesc> const_opdesc = ComGraphMakeShared<OpDesc>();
diff --git a/src/ge/CMakeLists.txt b/src/ge/CMakeLists.txt
index db00d8a1..3f4f1a8b 100755
--- a/src/ge/CMakeLists.txt
+++ b/src/ge/CMakeLists.txt
@@ -229,6 +229,7 @@ target_link_libraries(ge_runner
         ${resouce}
         ${ascend_hal}
         ${adump_server}
+        ${msprofiler}
         rt
         dl)
 
@@ -358,7 +359,10 @@ add_library(ge_compiler SHARED ${INFER_SRC_LIST} ${PROTO_SRCS} ${PROTO_HEADER_HD
 target_compile_definitions(ge_compiler PRIVATE
         PROTOBUF_INLINE_NOT_IN_HEADERS=0
         REUSE_MEMORY=1
-        FMK_HOST_INFER)
+        FMK_HOST_INFER
+        FMK_SUPPORT_DUMP
+        COMPILE_OMG_PACKAGE
+        REUSE_MEMORY=1)
 target_link_libraries(ge_compiler
         graph
         ge_common
diff --git a/src/ge/client/CMakeLists.txt b/src/ge/client/CMakeLists.txt
index a87beb77..b568e3f6 100755
--- a/src/ge/client/CMakeLists.txt
+++ b/src/ge/client/CMakeLists.txt
@@ -68,5 +68,7 @@ target_link_libraries(ge_client
         ${mmpa}
         ${runtime}
         ${msprof}
+        ${msprofiler}
+        ${ascend_hal}
         rt
         dl)
diff --git a/src/ge/client/ge_api.cc b/src/ge/client/ge_api.cc
index ad01e48f..7c4cf9c8 100644
--- a/src/ge/client/ge_api.cc
+++ b/src/ge/client/ge_api.cc
@@ -16,6 +16,7 @@
 
 #include "ge/ge_api.h"
 #include <iostream>
+#include <malloc.h>
 #include "common/debug/log.h"
 #include "framework/common/debug/ge_log.h"
 #include "common/ge/datatype_util.h"
@@ -163,6 +164,9 @@ Status GEFinalize() {
     g_ge_initialized = false;
   }
 
+  // to avoid memory fragment, use malloc_trim to back free stack to system
+  malloc_trim(0);
+
   GELOGT(TRACE_STOP, "GEFinalize finished");
   return ret;
 }
diff --git a/src/ge/client/module.mk b/src/ge/client/module.mk
index 1a304cbf..476841c9 100644
--- a/src/ge/client/module.mk
+++ b/src/ge/client/module.mk
@@ -70,9 +70,10 @@ LOCAL_SHARED_LIBRARIES := \
     libregister \
     libge_compiler \
     libge_common \
-    libmsprof
-
+    libmsprof \
+    stub/libascend_hal
 
+LOCAL_STATIC_LIBRARIES := libmsprofiler
 
 LOCAL_LDFLAGS := -lrt -ldl
 
@@ -107,6 +108,7 @@ LOCAL_SHARED_LIBRARIES := \
     libge_common \
     libmsprof
 
+LOCAL_STATIC_LIBRARIES := libmsprofiler
 
 LOCAL_LDFLAGS := -lrt -ldl
 LOCAL_CFLAGS += \
diff --git a/src/ge/common/dump/dump_op.cc b/src/ge/common/dump/dump_op.cc
index 31a88023..8c4ff330 100644
--- a/src/ge/common/dump/dump_op.cc
+++ b/src/ge/common/dump/dump_op.cc
@@ -172,18 +172,18 @@ Status DumpOp::ExecutorDumpOp(aicpu::dump::OpMappingInfo &op_mapping_info) {
     return RT_FAILED;
   }
 
-  constexpr int32_t ioAddrNum = 2;
-  constexpr uint32_t argsSize = sizeof(aicpu::AicpuParamHead) + ioAddrNum * sizeof(uint64_t);
-  char args[argsSize] = {0};
-  auto paramHead = reinterpret_cast<aicpu::AicpuParamHead *>(args);
-  paramHead->length = argsSize;
-  paramHead->ioAddrNum = ioAddrNum;
-  auto ioAddr = reinterpret_cast<uint64_t *>(args + sizeof(aicpu::AicpuParamHead));
-  ioAddr[0] = reinterpret_cast<uintptr_t>(proto_dev_mem_);
-  ioAddr[1] = reinterpret_cast<uintptr_t>(proto_size_dev_mem_);
+  constexpr int32_t io_addr_num = 2;
+  constexpr uint32_t args_size = sizeof(aicpu::AicpuParamHead) + io_addr_num * sizeof(uint64_t);
+  char args[args_size] = {0};
+  auto param_head = reinterpret_cast<aicpu::AicpuParamHead *>(args);
+  param_head->length = args_size;
+  param_head->ioAddrNum = io_addr_num;
+  auto io_addr = reinterpret_cast<uint64_t *>(args + sizeof(aicpu::AicpuParamHead));
+  io_addr[0] = reinterpret_cast<uintptr_t>(proto_dev_mem_);
+  io_addr[1] = reinterpret_cast<uintptr_t>(proto_size_dev_mem_);
   rt_ret = rtCpuKernelLaunch(nullptr, kDumpKernelsDumpOp,
                              1,  // blockDim default 1
-                             args, argsSize,
+                             args, args_size,
                              nullptr,  // no need smDesc
                              stream_);
   if (rt_ret != RT_ERROR_NONE) {
diff --git a/src/ge/common/ge/datatype_util.cc b/src/ge/common/ge/datatype_util.cc
index f2ff12cb..79a473fe 100644
--- a/src/ge/common/ge/datatype_util.cc
+++ b/src/ge/common/ge/datatype_util.cc
@@ -34,7 +34,7 @@ std::map<ge::DataType, std::vector<ge::DataType>> g_reverse_translatable_data_ty
   {ge::DT_INT32, {ge::DT_BOOL, ge::DT_INT64}},
   {ge::DT_FLOAT, {ge::DT_FLOAT16, ge::DT_FLOAT}}};
 
-static const std::map<ge::DataType, ge::proto::DataType> g_dump_data_type_map = {
+std::map<ge::DataType, ge::proto::DataType> g_dump_data_type_map = {
   // key:ge datatype,value:proto datatype
   {ge::DT_UNDEFINED, ge::proto::DT_UNDEFINED},
   {ge::DT_FLOAT, ge::proto::DT_FLOAT},
diff --git a/src/ge/common/profiling/profiling_manager.cc b/src/ge/common/profiling/profiling_manager.cc
index d02f7e8f..9492045c 100644
--- a/src/ge/common/profiling/profiling_manager.cc
+++ b/src/ge/common/profiling/profiling_manager.cc
@@ -51,12 +51,13 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager &ProfilingMana
   return profiling_manager;
 }
 
-FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::Init(const Options &options,
-                                                                                   bool convert_2_phy_device_id) {
+FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::Init(const Options &options) {
 #ifdef DAVINCI_SUPPORT_PROFILING
   vector<int32_t>().swap(device_id_);
   job_id_ = options.job_id;
 
+  GELOGI("ProfilingManager::Init  job_id:%s", job_id_.c_str());
+
   Status ret;
   if (!recv_profiling_config_.empty()) {
     GELOGI("Profiling json config from acl:%s", recv_profiling_config_.c_str());
@@ -64,18 +65,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In
   } else {
     ret = InitFromOptions(options);
     if (ret == SUCCESS && is_load_profiling_) {
-      // profiling need phy device id
-      if (!convert_2_phy_device_id) {
-        device_id_.push_back(options.device_id);
-      } else {
-        uint32_t phy_device_id = 0;
-        rtError_t rt_ret = rtGetDevicePhyIdByIndex(static_cast<uint32_t>(options.device_id), &phy_device_id);
-        if (rt_ret != RT_ERROR_NONE) {
-          GELOGE(rt_ret, "runtime get phy_device_id failed, current phy_device_id:%u", phy_device_id);
-          return FAILED;
-        }
-        device_id_.push_back(phy_device_id);
-      }
+      device_id_.push_back(options.device_id);
     }
   }
   if (ret != SUCCESS) {
@@ -557,25 +547,17 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY void ProfilingManager::ReportPr
     return;
   }
   GELOGI("current logic_device_id:%d", logic_device_id);
-
-  uint32_t phy_device_id = 0;
-  rt_ret = rtGetDevicePhyIdByIndex((uint32_t)logic_device_id, &phy_device_id);
-  if (rt_ret != RT_ERROR_NONE) {
-    GELOGE(rt_ret, "runtime get phy_device_id failed, current phy_device_id:%d", phy_device_id);
-    return;
-  }
-  GELOGI("current phy_device_id:%d", phy_device_id);
   if (!is_acl_api_mode_) {
-    auto ret = std::find(device_id_.begin(), device_id_.end(), phy_device_id);
+    auto ret = std::find(device_id_.begin(), device_id_.end(), logic_device_id);
     if (ret == device_id_.end()) {
       GELOGE(FAILED, "get valid phy_device_id failed, profiling report failed.");
       return;
     }
   }
   GELOGI("start ProfilingTaskDescInfo.");
-  ProfilingTaskDescInfo(task_desc_info, phy_device_id);
+  ProfilingTaskDescInfo(task_desc_info, logic_device_id);
   GELOGI("start ProfilingGraphDescInfo.");
-  ProfilingGraphDescInfo(compute_graph_desc_info, phy_device_id);
+  ProfilingGraphDescInfo(compute_graph_desc_info, logic_device_id);
   GELOGI("Report profiling data for GE end.");
 #endif
 }
diff --git a/src/ge/common/profiling/profiling_manager.h b/src/ge/common/profiling/profiling_manager.h
index f4249451..a030efd3 100644
--- a/src/ge/common/profiling/profiling_manager.h
+++ b/src/ge/common/profiling/profiling_manager.h
@@ -69,7 +69,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ProfilingManager {
   ProfilingManager();
   virtual ~ProfilingManager();
   static ProfilingManager &Instance();
-  ge::Status Init(const Options &options, bool convert_2_phy_device_id = false);
+  ge::Status Init(const Options &options);
   ge::Status InitFromOptions(const Options &options);
   ge::Status InitFromAclCfg(const std::string &config);
   ge::Status StartProfiling(int32_t iter, int32_t device_id);
diff --git a/src/ge/common/util.cc b/src/ge/common/util.cc
index ce5aa57e..4adf3ebd 100644
--- a/src/ge/common/util.cc
+++ b/src/ge/common/util.cc
@@ -472,7 +472,7 @@ FMK_FUNC_HOST_VISIBILITY bool ValidateStr(const std::string &str, const std::str
     return true;
   }
 
-  ret = regexec(&reg, str.c_str(), 0, nullptr, 0);
+  ret = regexec(&reg, str.c_str(), 0, NULL, 0);
   if (ret) {
     regerror(ret, &reg, ebuff, kMaxBuffSize);
     GELOGE(ge::PARAM_INVALID, "regexec failed, reason: %s", ebuff);
diff --git a/src/ge/executor/CMakeLists.txt b/src/ge/executor/CMakeLists.txt
index 7358585a..b68507bd 100755
--- a/src/ge/executor/CMakeLists.txt
+++ b/src/ge/executor/CMakeLists.txt
@@ -120,6 +120,7 @@ target_link_libraries(ge_executor
         ${mmpa}
         ${msprof}
         ${error_manager}
+        ${ascend_hal}
         rt
         dl)
 
diff --git a/src/ge/executor/module.mk b/src/ge/executor/module.mk
index bb642da9..1c3efe4c 100644
--- a/src/ge/executor/module.mk
+++ b/src/ge/executor/module.mk
@@ -89,6 +89,7 @@ local_ge_executor_shared_library :=        \
     libregister                            \
     libmsprof                              \
     liberror_manager                       \
+    libascend_hal
 
 local_ge_executor_ldflags := -lrt -ldl     \
 
@@ -104,6 +105,7 @@ LOCAL_SRC_FILES := $(local_ge_executor_src_files)
 LOCAL_C_INCLUDES := $(local_ge_executor_c_include)
 
 LOCAL_SHARED_LIBRARIES := $(local_ge_executor_shared_library)
+LOCAL_STATIC_LIBRARIES := libmsprofiler
 ifeq ($(device_os),android)
 LOCAL_LDFLAGS += -ldl
 LOCAL_LDLIBS += -L$(PWD)/prebuilts/clang/linux-x86/aarch64/android-ndk-r21/sysroot/usr/lib/aarch64-linux-android/29 -llog
@@ -140,6 +142,9 @@ LOCAL_SHARED_LIBRARIES :=                  \
     libregister                            \
     libmsprof                              \
     liberror_manager                       \
+    stub/libascend_hal
+
+LOCAL_STATIC_LIBRARIES := libmsprofiler
 
 LOCAL_LDFLAGS += $(local_ge_executor_ldflags)
 
diff --git a/src/ge/ge_inference.mk b/src/ge/ge_inference.mk
index 232e79ec..621e42c5 100644
--- a/src/ge/ge_inference.mk
+++ b/src/ge/ge_inference.mk
@@ -355,7 +355,7 @@ LOCAL_MODULE := libge_compiler
 
 LOCAL_CFLAGS += -DPROTOBUF_INLINE_NOT_IN_HEADERS=0 -DREUSE_MEMORY=1 -O2
 # from ome_inference.mk
-LOCAL_CFLAGS += -DFMK_HOST_INFER -DFMK_SUPPORT_DUMP
+LOCAL_CFLAGS += -DFMK_HOST_INFER -DFMK_SUPPORT_DUMP -DCOMPILE_OMG_PACKAGE
 ifeq ($(DEBUG), 1)
 LOCAL_CFLAGS += -g -O0
 endif
@@ -418,7 +418,7 @@ include $(CLEAR_VARS)
 LOCAL_MODULE := libge_compiler
 LOCAL_CFLAGS += -DGOOGLE_PROTOBUF_NO_RTTI -DDEV_VISIBILITY -DNONSUPPORT_SAVE_TO_FILE
 LOCAL_CFLAGS += -DPROTOBUF_INLINE_NOT_IN_HEADERS=0
-LOCAL_CFLAGS += -DREUSE_MEMORY=1 -DFMK_SUPPORT_DUMP
+LOCAL_CFLAGS += -DREUSE_MEMORY=1 -DFMK_SUPPORT_DUMP -DCOMPILE_OMG_PACKAGE
 LOCAL_CFLAGS += -DOMG_DEVICE_VERSION
 LOCAL_CFLAGS += -O2
 LOCAL_MODULE_CLASS := SHARED_LIBRARIES
diff --git a/src/ge/ge_local_engine/CMakeLists.txt b/src/ge/ge_local_engine/CMakeLists.txt
index e685c301..bcbc3e4c 100755
--- a/src/ge/ge_local_engine/CMakeLists.txt
+++ b/src/ge/ge_local_engine/CMakeLists.txt
@@ -42,7 +42,7 @@ include_directories(${CMAKE_BINARY_DIR}/proto/ge)
 
 ######### libge_local_engine.so #############
 add_library(ge_local_engine SHARED ${SRC_LIST} ${PROTO_HDRS})
-target_compile_definitions(ge_local_engine PRIVATE Werror)
+target_compile_definitions(ge_local_engine PRIVATE Werror COMPILE_OMG_PACKAGE)
 target_link_libraries(ge_local_engine
         graph
         ${PROTOBUF_LIBRARY}
diff --git a/src/ge/ge_local_engine/module.mk b/src/ge/ge_local_engine/module.mk
index ee6b15c1..3307f780 100644
--- a/src/ge/ge_local_engine/module.mk
+++ b/src/ge/ge_local_engine/module.mk
@@ -42,7 +42,7 @@ include ${BUILD_HOST_SHARED_LIBRARY}
 include $(CLEAR_VARS)
 LOCAL_MODULE := atclib/libge_local_engine
 LOCAL_CFLAGS += -Werror
-LOCAL_CFLAGS += -std=c++11
+LOCAL_CFLAGS += -std=c++11 -DCOMPILE_OMG_PACKAGE
 LOCAL_LDFLAGS :=
 
 LOCAL_STATIC_LIBRARIES :=
diff --git a/src/ge/ge_runner.mk b/src/ge/ge_runner.mk
index 04182070..956bab0b 100644
--- a/src/ge/ge_runner.mk
+++ b/src/ge/ge_runner.mk
@@ -356,6 +356,7 @@ LOCAL_SRC_FILES += $(LIBCLIENT_LOCAL_SRC_FILES)
 
 LOCAL_STATIC_LIBRARIES := libge_memory \
                           libadump_server \
+                          libmsprofiler \
 
 LOCAL_SHARED_LIBRARIES := \
     libc_sec \
diff --git a/src/ge/generator/ge_generator.cc b/src/ge/generator/ge_generator.cc
index edd7a155..bef93333 100644
--- a/src/ge/generator/ge_generator.cc
+++ b/src/ge/generator/ge_generator.cc
@@ -136,6 +136,13 @@ static Status AddInputs(const ComputeGraphPtr &graph, const NodePtr &node, GeTen
                         bool attr) {
   GE_CHECK_NOTNULL_EXEC(graph, return PARAM_INVALID);
   GE_CHECK_NOTNULL_EXEC(node, return PARAM_INVALID);
+
+  auto format = tensor.GetFormat();
+  auto data_type = tensor.GetDataType();
+  if (format == FORMAT_RESERVED && data_type == DT_UNDEFINED) {
+    return SUCCESS;
+  }
+
   string op_type;
   if (!AttrUtils::GetStr(tensor, kAttrOpType, op_type) || op_type.empty()) {
     op_type = DATA;
@@ -521,8 +528,8 @@ Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector<GeTensor> &in
                                   const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff,
                                   bool is_offline) {
   GE_CHECK_NOTNULL_EXEC(op_desc, return PARAM_INVALID);
-  if (!inputs.empty() && (inputs.size() != op_desc->GetInputsSize())) {
-    GELOGE(PARAM_INVALID, "Tensor size: %zu, Inputs size: %zu", inputs.size(), op_desc->GetInputsSize());
+  if (!inputs.empty() && (inputs.size() != op_desc->GetAllInputsSize())) {
+    GELOGE(PARAM_INVALID, "Tensor size: %zu, Inputs size: %zu", inputs.size(), op_desc->GetAllInputsSize());
     return PARAM_INVALID;
   }
   if (!outputs.empty() && (outputs.size() != op_desc->GetOutputsSize())) {
diff --git a/src/ge/graph/build/memory/graph_mem_assigner.cc b/src/ge/graph/build/memory/graph_mem_assigner.cc
index 1518714f..1cdb2efa 100644
--- a/src/ge/graph/build/memory/graph_mem_assigner.cc
+++ b/src/ge/graph/build/memory/graph_mem_assigner.cc
@@ -322,11 +322,19 @@ Status GraphMemoryAssigner::ReAssignContinuousMemory(bool is_loop_graph) {
           GELOGE(ge::FAILED,
                  "There is an atomic conflict between the current node and the peer out node, not supported!");
           return ge::FAILED;
-        } else if (is_loop_graph) {
-          GE_CHK_STATUS_RET(SetLoopGraphAtomicAttr(node, mem_clean_start));
-        } else {
-          GE_CHK_STATUS_RET(SetAtomicCleanAttr(nullptr, {mem_clean_start}, {mem_clean_size}),
-                            "SetAtomicCleanAttr failed.");
+        }
+
+        const auto &in_control_anchor = node->GetInControlAnchor();
+        GE_CHECK_NOTNULL(in_control_anchor);
+        for (const auto &peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) {
+          auto peer_out_node = peer_out_control_anchor->GetOwnerNode();
+          if (peer_out_node->GetType() == ATOMICADDRCLEAN) {
+            ret = SetAtomicCleanAttr(peer_out_node, {mem_clean_start}, {mem_clean_size});
+            if (ret != SUCCESS) {
+              GELOGE(ret, "Failed to set attr for atomic addr clean node %s.", peer_out_node->GetName().c_str());
+              return ret;
+            }
+          }
         }
       }
     }
@@ -840,68 +848,37 @@ Status GraphMemoryAssigner::ReAssignVirtualNodesMemory(map<string, vector<NodePt
 }
 
 Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {
-  GE_CHECK_NOTNULL(compute_graph_);
-  // Atomic op memory start addr
-  int64_t atomic_mem_start = static_cast<int64_t>(memory_offset_[0].mem_offset_);
-  GELOGI("Begin to reAssign atomic memory, atomic initial address mem_offset = %zu!", memory_offset_[0].mem_offset_);
-
-  vector<NodePtr> connect_netoutput_nodes;
-  for (auto &node : compute_graph_->GetAllNodes()) {
-    auto node_op_desc = node->GetOpDesc();
-    if (node_op_desc == nullptr) {
-      continue;
-    }
-
-    bool is_atomic = false;
-    // If GetBool fail, is_atomic is false.
-    (void)ge::AttrUtils::GetBool(node_op_desc, ATOMIC_ATTR_IS_ATOMIC_NODE, is_atomic);
-    if (!is_atomic) {
-      continue;
-    }
-
-    bool is_ref = false;
-    // If GetBool fail, is_ref is false.
-    (void)ge::AttrUtils::GetBool(node_op_desc, ATTR_NAME_REFERENCE, is_ref);
-    if (is_ref) {
-      GELOGE(ge::PARAM_INVALID, "The node %s cannot have both atomic and ref attribute.",
-             node_op_desc->GetName().c_str());
-      return ge::PARAM_INVALID;
-    }
-
-    vector<int> is_connect_netoutput;
-    // If GetBool fail, attr is_connect_netoutput is an empty vector.
-    (void)ge::AttrUtils::GetListInt(node_op_desc, ATTR_NAME_NODE_CONNECT_OUTPUT, is_connect_netoutput);
-    if (!is_connect_netoutput.empty()) {
-      connect_netoutput_nodes.emplace_back(node);
-      continue;
-    }
+  map<NodePtr, vector<NodePtr>> normal_atomic_and_clean_nodes_map;
+  vector<NodePtr> connecting_output_atomic_nodes;
+  Status status = FilterAtomicNodesForMemoryAssign(normal_atomic_and_clean_nodes_map, connecting_output_atomic_nodes);
+  if (status != SUCCESS) {
+    GELOGE(status, "Failed to filter atomic nodes for memory assignment.");
+    return status;
+  }
 
-    // Atomic op memory start addr of loop graph
-    int64_t loop_graph_atomic_mem_start = static_cast<int64_t>(memory_offset_[0].mem_offset_);
-    vector<int64_t> mem_offset_end;
-    if (AssignAtomicOutputAndWorkspaceMemory(node, mem_offset_end) != SUCCESS) {
-      GELOGE(FAILED, "Assign atomic output and workspace memory failed, node is %s.", node->GetName().c_str());
-      return FAILED;
-    }
+  for (auto &iter : normal_atomic_and_clean_nodes_map) {
+    int64_t atomic_mem_start = static_cast<int64_t>(memory_offset_[0].mem_offset_);
+    GELOGD("Begin to reAssign atomic memory, atomic address memory start = %ld", atomic_mem_start);
 
-    /// In networks with loop op, atomic op uses atomic_addr_clean op independently,
-    /// so we need to set the attr separately.
-    if (is_loop_graph) {
-      GE_CHK_STATUS_RET(SetLoopGraphAtomicAttr(node, loop_graph_atomic_mem_start));
+    for (auto &atomic_node : iter.second) {
+      vector<int64_t> mem_offset_end;
+      status = AssignAtomicOutputAndWorkspaceMemory(atomic_node, mem_offset_end);
+      if (status != SUCCESS) {
+        GELOGE(status, "Assign atomic output and workspace memory failed, node name is %s.",
+               atomic_node->GetName().c_str());
+        return status;
+      }
     }
-  }
 
-  // In networks without loop op, the same atomic addr clean op is used for atomic op
-  if (!is_loop_graph) {
-    // Set the address attr of atomic clean operator
-    int64_t atomic_mem_size = memory_offset_[0].mem_offset_ - atomic_mem_start;
-    if (atomic_mem_size != 0) {
-      GE_CHK_STATUS_RET(SetAtomicCleanAttr(nullptr, {atomic_mem_start}, {atomic_mem_size}),
-                        "SetAtomicCleanAttr failed.");
+    int64_t atomic_mem_size = static_cast<int64_t>(memory_offset_[0].mem_offset_) - atomic_mem_start;
+    status = SetAtomicCleanAttr(iter.first, {atomic_mem_start}, {atomic_mem_size});
+    if (status != SUCCESS) {
+      GELOGE(status, "Failed to set attr for atomic addr clean node %s.", iter.first->GetName().c_str());
+      return status;
     }
   }
 
-  if (AssignConnectNetOutputAtomicMemory(connect_netoutput_nodes) != SUCCESS) {
+  if (AssignConnectNetOutputAtomicMemory(connecting_output_atomic_nodes) != SUCCESS) {
     GELOGE(FAILED, "Failed to assign memory of nodes that connect to netoutput.");
     return FAILED;
   }
@@ -909,6 +886,71 @@ Status GraphMemoryAssigner::ReAssignAtomicMemory(bool is_loop_graph) {
   return SUCCESS;
 }
 
+///
+/// @brief Collect the atomic nodes driven by each ATOMICADDRCLEAN node and split them into two groups.
+/// @param [out] normal_atomic_nodes_map  clean node -> its atomic peers without ATTR_NAME_NODE_CONNECT_OUTPUT
+/// @param [out] connecting_output_atomic_nodes  atomic peers that carry ATTR_NAME_NODE_CONNECT_OUTPUT
+/// @return SUCCESS, PARAM_INVALID if a node is both atomic and reference, or the null-check failure status
+///
+Status GraphMemoryAssigner::FilterAtomicNodesForMemoryAssign(map<NodePtr, vector<NodePtr>> &normal_atomic_nodes_map,
+                                                             vector<NodePtr> &connecting_output_atomic_nodes) {
+  GE_CHECK_NOTNULL(compute_graph_);
+  for (const auto &node : compute_graph_->GetAllNodes()) {
+    if (node->GetType() != ATOMICADDRCLEAN) {
+      continue;
+    }
+
+    vector<NodePtr> tmp_normal_atomic_nodes;
+    const auto &out_control_anchor = node->GetOutControlAnchor();
+    GE_CHECK_NOTNULL(out_control_anchor);
+    for (const auto &peer_in_control_anchor : out_control_anchor->GetPeerInControlAnchors()) {
+      if (peer_in_control_anchor == nullptr) {
+        continue;
+      }
+      auto peer_in_node = peer_in_control_anchor->GetOwnerNode();
+      // The owner node is dereferenced right below, so reject a null one instead of crashing.
+      GE_CHECK_NOTNULL(peer_in_node);
+      auto peer_in_node_desc = peer_in_node->GetOpDesc();
+      if (peer_in_node_desc == nullptr) {
+        continue;
+      }
+
+      bool is_atomic_node = false;
+      // If GetBool fail, is_atomic_node is false.
+      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATOMIC_ATTR_IS_ATOMIC_NODE, is_atomic_node);
+      if (!is_atomic_node) {
+        continue;
+      }
+
+      bool is_reference = false;
+      // If GetBool fail, is_reference is false.
+      (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATTR_NAME_REFERENCE, is_reference);
+      if (is_reference) {
+        GELOGE(ge::PARAM_INVALID, "The node %s cannot have both atomic and is_reference attribute.",
+               peer_in_node_desc->GetName().c_str());
+        return ge::PARAM_INVALID;
+      }
+
+      vector<int> is_connecting_output;
+      // If GetListInt fail, is_connecting_output stays an empty vector.
+      (void)ge::AttrUtils::GetListInt(peer_in_node_desc, ATTR_NAME_NODE_CONNECT_OUTPUT, is_connecting_output);
+      if (!is_connecting_output.empty()) {
+        // A peer with the connect-output attr is recorded on its own; the normal peers gathered so far for
+        // this clean node are discarded and the remaining peers are not scanned.
+        connecting_output_atomic_nodes.emplace_back(peer_in_node);
+        tmp_normal_atomic_nodes.clear();
+        break;
+      }
+      tmp_normal_atomic_nodes.emplace_back(peer_in_node);
+    }
+
+    if (!tmp_normal_atomic_nodes.empty()) {
+      normal_atomic_nodes_map[node] = tmp_normal_atomic_nodes;
+    }
+  }
+  return SUCCESS;
+}
+
 Status GraphMemoryAssigner::AssignAtomicOutputAndWorkspaceMemory(const ge::NodePtr &node,
                                                                  vector<int64_t> &mem_offset_end) {
   auto node_op_desc = node->GetOpDesc();
@@ -1331,6 +1357,7 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector<
   vector<int64_t> memory_type;
   auto tmp_op_desc = node->GetOpDesc();
   origin_input_list = tmp_op_desc->GetInputOffset();
+  int64_t valid_input_index = 0;
   bool has_mem_type_attr = ge::AttrUtils::GetListInt(tmp_op_desc, ATTR_NAME_INPUT_MEM_TYPE_LIST, memory_type);
   for (const auto &anchor : node->GetAllInDataAnchors()) {
     vector<int64_t> output_list;
@@ -1344,8 +1371,9 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector<
     auto last_peer_out_op_desc = last_peer_out_node->GetOpDesc();
     GE_CHECK_NOTNULL(last_peer_out_op_desc);
     output_list = last_peer_out_op_desc->GetOutputOffset();
-    if (output_list.size() > static_cast<size_t>(peer_out_anchor->GetIdx())) {
-      auto input_index = anchor->GetIdx();
+    auto out_index = static_cast<unsigned long>(peer_out_anchor->GetIdx());
+    if (output_list.size() > static_cast<size_t>(out_index)) {
+      int64_t input_offset = output_list.at(out_index);
       if (has_mem_type_attr) {
         auto input_size = tmp_op_desc->GetInputsSize();
         auto ori_input_offset_list_size = origin_input_list.size();
@@ -1359,26 +1387,21 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector<
         }
         // not hbm keep orignal inputoffest
         // hbm inputoffset = original inputoffset + outputoffset
-        input_list.emplace_back(memory_type[input_index] == RT_MEMORY_L1
-                                  ? origin_input_list[input_index]
-                                  : origin_input_list[input_index] + output_list.at(peer_out_anchor->GetIdx()));
-        GELOGI("fuison: node[%s] input[%d] is set from node[%s] out index[%d] offset[%ld]",
-               tmp_op_desc->GetName().c_str(), input_index,
-               peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(), peer_out_anchor->GetIdx(),
-               input_list.back());
-      } else {
-        int64_t output_offset = output_list.at(peer_out_anchor->GetIdx());
-        const auto &in_node = GetKnownInputNode(peer_out_anchor->GetOwnerNode());
-        if (in_node->GetType() == CONSTANT) {
-          GeTensorDesc tensor_desc = tmp_op_desc->GetInputDesc(input_index);
-          GE_CHK_STATUS(TensorUtils::GetDataOffset(tensor_desc, output_offset));
-        }
-
-        GELOGI("node[%s] input[%d] is set from node[%s] out index[%d] offset[%ld]", tmp_op_desc->GetName().c_str(),
-               input_index, peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(), peer_out_anchor->GetIdx(),
-               output_offset);
-        input_list.emplace_back(output_offset);
+        input_offset = (memory_type[valid_input_index] == RT_MEMORY_L1
+                          ? origin_input_list[valid_input_index]
+                          : origin_input_list[valid_input_index] + output_list.at(out_index));
+      }
+      const auto &in_node = GetKnownInputNode(peer_out_anchor->GetOwnerNode());
+      if (in_node->GetType() == CONSTANT) {
+        GeTensorDesc tensor_desc = tmp_op_desc->GetInputDesc(static_cast<uint32_t>(anchor->GetIdx()));
+        GE_CHK_STATUS(TensorUtils::GetDataOffset(tensor_desc, input_offset));
       }
+
+      GELOGI("%s node[%s] input[%ld] is set from node[%s] out index[%lu] offset[%ld]",
+             has_mem_type_attr == true ? "Fusion" : "", tmp_op_desc->GetName().c_str(), valid_input_index,
+             peer_out_anchor->GetOwnerNode()->GetOpDesc()->GetName().c_str(), out_index, input_offset);
+      input_list.emplace_back(input_offset);
+      valid_input_index++;
     }
   }
   return ge::SUCCESS;
@@ -1473,125 +1496,49 @@ Status GraphMemoryAssigner::SetIndependentAtomicAttr(const ge::NodePtr &node, in
   return SUCCESS;
 }
 
-Status GraphMemoryAssigner::SetLoopGraphAtomicAttr(const ge::NodePtr &node, int64_t atomic_mem_start) {
-  // set the address attr of atomic clean operator for loop graph
-  int64_t atomic_mem_size = memory_offset_[0].mem_offset_ - atomic_mem_start;
-  GELOGI("SetLoopGraphAtomicAttr beign, atomic_addr_clean start size is %ld, mem_size is %ld, mem_offset is %zu.",
-         atomic_mem_start, atomic_mem_size, memory_offset_[0].mem_offset_);
-  const auto &in_control_anchor = node->GetInControlAnchor();
-  if (atomic_mem_size != 0 && in_control_anchor != nullptr) {
-    for (auto &peer_out_control_anchor : in_control_anchor->GetPeerOutControlAnchors()) {
-      if (peer_out_control_anchor == nullptr) {
-        continue;
-      }
-      auto peer_out_node = peer_out_control_anchor->GetOwnerNode();
-      auto peer_out_node_desc = peer_out_node->GetOpDesc();
-      if (peer_out_node_desc == nullptr) {
-        continue;
-      }
-
-      GELOGD("SetLoopGraphAtomicAttr,  node is %s, op type is %s.", peer_out_node_desc->GetName().c_str(),
-             peer_out_node_desc->GetType().c_str());
-
-      if (peer_out_node_desc->GetType() == ATOMICADDRCLEAN) {
-        GE_CHK_STATUS_EXEC(SetAtomicCleanAttr(peer_out_node, {atomic_mem_start}, {atomic_mem_size}),
-                           GELOGE(FAILED, "SetAtomicCleanAttr failed.");
-                           return FAILED);
-      }
-    }
-  }
-  return SUCCESS;
-}
-
-ge::Status GraphMemoryAssigner::IsIndependentAtomicClean(const ge::NodePtr &node,
-                                                         bool &is_independent_atomic_clean_node) {
-  GE_CHECK_NOTNULL(node);
-  const auto &out_control_anchor = node->GetOutControlAnchor();
-  GE_CHECK_NOTNULL(out_control_anchor);
-  for (const auto &peer_in_control_anchor : out_control_anchor->GetPeerInControlAnchors()) {
-    if (peer_in_control_anchor != nullptr) {
-      auto peer_in_node = peer_in_control_anchor->GetOwnerNode();
-      auto peer_in_node_desc = peer_in_node->GetOpDesc();
-      if (peer_in_node_desc != nullptr) {
-        bool is_atomic_node = false;
-        // If GetBool fail, is_atomic_node is false.
-        (void)ge::AttrUtils::GetBool(peer_in_node_desc, ATOMIC_ATTR_IS_ATOMIC_NODE, is_atomic_node);
-        if (is_atomic_node) {
-          vector<int> is_connect_netoutput;
-          // If GetBool fail, attr is_connect_netoutput is an empty vector.
-          (void)ge::AttrUtils::GetListInt(peer_in_node_desc, ATTR_NAME_NODE_CONNECT_OUTPUT, is_connect_netoutput);
-          if (!is_connect_netoutput.empty()) {
-            GELOGD("Peer in node %s is independent atomic clean node", peer_in_node->GetName().c_str());
-            is_independent_atomic_clean_node = true;
-            break;
-          }
-        }
-      }
-    }
-  }
-
-  return SUCCESS;
-}
-
-ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &n, const vector<int64_t> &atomic_mem_start,
+ge::Status GraphMemoryAssigner::SetAtomicCleanAttr(const NodePtr &node, const vector<int64_t> &atomic_mem_start,
                                                    const vector<int64_t> &atomic_mem_size) {
-  for (ge::NodePtr &node : compute_graph_->GetAllNodes()) {
-    auto node_op_desc = node->GetOpDesc();
-    GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);
-
-    bool is_valid_atomic_clean_node = (n != nullptr) && (node->GetName() == n->GetName());
-
-    if (((n == nullptr) && (node_op_desc->GetType() == ATOMICADDRCLEAN))) {
-      bool is_independent_atomic_clean = false;
-      if (IsIndependentAtomicClean(node, is_independent_atomic_clean) != SUCCESS) {
-        GELOGE(FAILED, "Failed to determine the connection relationship of atomic addr clean node.");
-        return PARAM_INVALID;
-      }
-
-      is_valid_atomic_clean_node = is_valid_atomic_clean_node || (!is_independent_atomic_clean);
+  auto node_op_desc = node->GetOpDesc();
+  if (node_op_desc != nullptr) {
+    GELOGD("Node %s, set atomic clean attr start.", node->GetName().c_str());
+    vector<int64_t> workspace_vector = node_op_desc->GetWorkspace();
+    vector<int64_t> workspace_byte_vector = node_op_desc->GetWorkspaceBytes();
+    workspace_vector.insert(workspace_vector.end(), atomic_mem_start.begin(), atomic_mem_start.end());
+    workspace_byte_vector.insert(workspace_byte_vector.end(), atomic_mem_size.begin(), atomic_mem_size.end());
+    node_op_desc->SetWorkspace(workspace_vector);
+    node_op_desc->SetWorkspaceBytes(workspace_byte_vector);
+
+    std::vector<int64_t> mem_start_vector;
+    // If GetListInt fail, mem_start_vector is empty.
+    (void)ge::AttrUtils::GetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_START, mem_start_vector);
+    mem_start_vector.insert(mem_start_vector.end(), atomic_mem_start.begin(), atomic_mem_start.end());
+    GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_START, mem_start_vector),
+                     GELOGE(FAILED, "SetListInt failed.");
+                     return FAILED);
+
+    std::vector<int64_t> mem_size_vector;
+    // If GetListInt fail, mem_size_vector is empty.
+    (void)ge::AttrUtils::GetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_MEM_SIZE, mem_size_vector);
+    mem_size_vector.insert(mem_size_vector.end(), atomic_mem_size.begin(), atomic_mem_size.end());
+    GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_MEM_SIZE, mem_size_vector),
+                     GELOGE(FAILED, "SetListInt failed.");
+                     return FAILED);
+
+    std::stringstream ss;
+    for (auto iter : atomic_mem_start) {
+      ss << iter << " ";
     }
-
-    if (is_valid_atomic_clean_node) {
-      GELOGD("Node %s, set atomic clean attr start.", node->GetName().c_str());
-      vector<int64_t> workspace_vector = node_op_desc->GetWorkspace();
-      vector<int64_t> workspace_byte_vector = node_op_desc->GetWorkspaceBytes();
-      workspace_vector.insert(workspace_vector.end(), atomic_mem_start.begin(), atomic_mem_start.end());
-      workspace_byte_vector.insert(workspace_byte_vector.end(), atomic_mem_size.begin(), atomic_mem_size.end());
-      node_op_desc->SetWorkspace(workspace_vector);
-      node_op_desc->SetWorkspaceBytes(workspace_byte_vector);
-
-      std::vector<int64_t> mem_start_vector;
-      // If GetListInt fail, mem_start_vector is empty.
-      (void)ge::AttrUtils::GetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_START, mem_start_vector);
-      mem_start_vector.insert(mem_start_vector.end(), atomic_mem_start.begin(), atomic_mem_start.end());
-      GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_START, mem_start_vector),
-                       GELOGE(FAILED, "SetListInt failed.");
-                       return FAILED);
-
-      std::vector<int64_t> mem_size_vector;
-      // If GetListInt fail, mem_size_vector is empty.
-      (void)ge::AttrUtils::GetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_MEM_SIZE, mem_size_vector);
-      mem_size_vector.insert(mem_size_vector.end(), atomic_mem_size.begin(), atomic_mem_size.end());
-      GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListInt(node_op_desc, ATTR_NAME_AUTOMIC_ADD_MEM_SIZE, mem_size_vector),
-                       GELOGE(FAILED, "SetListInt failed.");
-                       return FAILED);
-
-      std::stringstream ss;
-      for (auto iter : atomic_mem_start) {
-        ss << iter << " ";
-      }
-      string atomic_mem_start_str = ss.str();
-      ss.clear();
-      ss.str("");
-      for (auto iter : atomic_mem_size) {
-        ss << iter << " ";
-      }
-      string atomic_mem_size_str = ss.str();
-
-      GELOGI("[IMAS]SetAtomicCleanAttr : Set graph[%s] atomic_node[%s] output offset [%s] size[%s] streamid[%ld]",
-             node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(),
-             atomic_mem_start_str.c_str(), atomic_mem_size_str.c_str(), node->GetOpDesc()->GetStreamId());
+    string atomic_mem_start_str = ss.str();
+    ss.clear();
+    ss.str("");
+    for (auto iter : atomic_mem_size) {
+      ss << iter << " ";
     }
+    string atomic_mem_size_str = ss.str();
+
+    GELOGI("[IMAS]SetAtomicCleanAttr : Set graph[%s] atomic_node[%s] output offset [%s] size[%s] streamid[%ld]",
+           node->GetOwnerComputeGraph()->GetName().c_str(), node_op_desc->GetName().c_str(),
+           atomic_mem_start_str.c_str(), atomic_mem_size_str.c_str(), node->GetOpDesc()->GetStreamId());
   }
   return SUCCESS;
 }
diff --git a/src/ge/graph/build/memory/graph_mem_assigner.h b/src/ge/graph/build/memory/graph_mem_assigner.h
index e1e408be..201e6d01 100644
--- a/src/ge/graph/build/memory/graph_mem_assigner.h
+++ b/src/ge/graph/build/memory/graph_mem_assigner.h
@@ -135,6 +135,9 @@ class GraphMemoryAssigner {
 
   ge::Status ReAssignAtomicMemory(bool is_loop_graph);
 
+  ge::Status FilterAtomicNodesForMemoryAssign(std::map<NodePtr, vector<NodePtr>> &normal_atomic_nodes_map,
+                                              std::vector<NodePtr> &connecting_output_atomic_nodes);
+
   ge::Status AssignContinuousInputMemory(const ge::NodePtr &node, int64_t &continuous_mem_start,
                                          int64_t &continuous_mem_size);
 
@@ -165,14 +168,8 @@ class GraphMemoryAssigner {
 
   ge::Status SetIndependentAtomicAttr(const ge::NodePtr &node, int64_t atomic_mem_start,
                                       const std::vector<int64_t> &mem_offset_end);
-  ///
-  /// @brief set loop graph atomic attr
-  /// @param node, atomic memory assignment start offset
-  /// @param atomic_mem_start: atomic op memory start address
-  ///
-  ge::Status SetLoopGraphAtomicAttr(const ge::NodePtr &node, int64_t atomic_mem_start);
 
-  ge::Status SetAtomicCleanAttr(const ge::NodePtr &n, const std::vector<int64_t> &atomic_mem_start,
+  ge::Status SetAtomicCleanAttr(const ge::NodePtr &node, const std::vector<int64_t> &atomic_mem_start,
                                 const std::vector<int64_t> &atomic_mem_size);
 
   ge::Status IsIndependentAtomicClean(const ge::NodePtr &node, bool &is_independent_atomic_clean_node);
diff --git a/src/ge/graph/load/new_model_manager/data_dumper.cc b/src/ge/graph/load/new_model_manager/data_dumper.cc
index e4e3a63f..c6283d92 100644
--- a/src/ge/graph/load/new_model_manager/data_dumper.cc
+++ b/src/ge/graph/load/new_model_manager/data_dumper.cc
@@ -695,11 +695,7 @@ Status DataDumper::LoadDumpInfo() {
     }
     if (dump_properties_.GetDumpMode() == kDumpInput) {
       if (op_iter.is_task) {
-        Status ret = DumpInput(op_iter, task);
-        if (ret != SUCCESS) {
-          GELOGE(ret, "Dump input failed");
-          return ret;
-        }
+        GE_CHK_STATUS_RET(DumpInput(op_iter, task), "Dump input failed");
       }
       op_mapping_info.mutable_task()->Add(std::move(task));
       continue;
@@ -726,7 +722,7 @@ Status DataDumper::LoadDumpInfo() {
 
   SetOpDebugIdToAicpu(op_debug_task_id_, op_debug_stream_id_, op_debug_addr_, op_mapping_info);
 
-  if (!op_list_.empty() || is_op_debug_) {
+  if (!op_list_.empty() || is_op_debug_ || is_end_graph_) {
     auto ret = ExecuteLoadDumpInfo(op_mapping_info);
     if (ret != SUCCESS) {
       GELOGE(ret, "Execute load dump info failed");
@@ -740,7 +736,6 @@ void DataDumper::SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id,
                                       aicpu::dump::OpMappingInfo &op_mapping_info) {
   if (dump_properties_.GetDumpMode() == kDumpOutput || dump_properties_.GetDumpMode() == kDumpInput ||
       dump_properties_.GetDumpMode() == kDumpAll) {
-    GELOGI("Add end_graph_info to aicpu, task_id is %u, stream_id is %u", end_graph_task_id_, end_graph_stream_id_);
     aicpu::dump::Task task;
     task.set_end_graph(true);
     task.set_task_id(end_graph_task_id_);
@@ -748,6 +743,14 @@ void DataDumper::SetEndGraphIdToAicpu(uint32_t task_id, uint32_t stream_id,
     task.mutable_op()->set_op_name(NODE_NAME_END_GRAPH);
     task.mutable_op()->set_op_type(ENDGRAPH);
     op_mapping_info.mutable_task()->Add(std::move(task));
+
+    is_end_graph_ = true;
+    if (op_mapping_info.model_name_param_case() == aicpu::dump::OpMappingInfo::kModelName) {
+      GELOGI("Add end_graph_info to aicpu, model_name is %s, task_id is %u, stream_id is %u",
+             op_mapping_info.model_name().c_str(), end_graph_task_id_, end_graph_stream_id_);
+      return;
+    }
+    GELOGI("Add end_graph_info to aicpu, task_id is %u, stream_id is %u", end_graph_task_id_, end_graph_stream_id_);
   }
 }
 
diff --git a/src/ge/graph/load/new_model_manager/data_dumper.h b/src/ge/graph/load/new_model_manager/data_dumper.h
index 0a1c2274..30218416 100644
--- a/src/ge/graph/load/new_model_manager/data_dumper.h
+++ b/src/ge/graph/load/new_model_manager/data_dumper.h
@@ -116,6 +116,7 @@ class DataDumper {
   std::vector<InnerDumpInfo> op_list_;
   uint32_t end_graph_task_id_ = 0;
   uint32_t end_graph_stream_id_ = 0;
+  bool is_end_graph_ = false;
   std::multimap<std::string, InnerInputMapping> input_map_;
   bool load_flag_;
   uint32_t device_id_;
diff --git a/src/ge/graph/load/new_model_manager/davinci_model.cc b/src/ge/graph/load/new_model_manager/davinci_model.cc
index 81eb4bc9..50867782 100644
--- a/src/ge/graph/load/new_model_manager/davinci_model.cc
+++ b/src/ge/graph/load/new_model_manager/davinci_model.cc
@@ -1928,13 +1928,7 @@ Status DavinciModel::SinkModelProfile() {
     name = name_;
   }
   size_t name_len = name.size();
-  // phy device id
-  uint32_t phy_device_id = 0;
-  rtError_t rt_ret = rtGetDevicePhyIdByIndex(device_id_, &phy_device_id);
-  GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE,
-                  GELOGE(rt_ret, "runtime get phy_device_id failed, current phy_device_id:%u", phy_device_id);
-                  return FAILED);
-  reporter_data.deviceId = phy_device_id;
+  reporter_data.deviceId = device_id_;
   reporter_data.data = (unsigned char *)&name_len;
   reporter_data.dataLen = sizeof(int32_t);
   GE_CHK_BOOL_EXEC(reporter->Report(&reporter_data) == SUCCESS, return FAILED, "Reporter data fail, model id:%u.",
@@ -2103,12 +2097,7 @@ Status DavinciModel::SinkTimeProfile(const InputData &current_data) {
   GE_CHK_BOOL_EXEC(memcpy_s(reporter_data.tag, MSPROF_ENGINE_MAX_TAG_LEN, tag_name.c_str(), tag_name.size()) == EOK,
                    return FAILED, "Sink model tag memcpy error.");
   // device id
-  uint32_t phy_device_id = 0;
-  rtError_t rt_ret = rtGetDevicePhyIdByIndex(device_id_, &phy_device_id);
-  GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE,
-                  GELOGE(rt_ret, "runtime get phy_device_id failed, current phy_device_id:%u", phy_device_id);
-                  return FAILED);
-  reporter_data.deviceId = phy_device_id;
+  reporter_data.deviceId = device_id_;
 
   // Model Header
   string name;
diff --git a/src/ge/graph/load/new_model_manager/model_manager.cc b/src/ge/graph/load/new_model_manager/model_manager.cc
index f6995052..4a596738 100644
--- a/src/ge/graph/load/new_model_manager/model_manager.cc
+++ b/src/ge/graph/load/new_model_manager/model_manager.cc
@@ -236,7 +236,6 @@ ModelManager::~ModelManager() {
   std::lock_guard<std::mutex> lock(map_mutex_);
   model_map_.clear();
   model_aicpu_kernel_.clear();
-  cust_aicpu_so_.clear();
 
   GE_IF_BOOL_EXEC(device_count > 0, GE_CHK_RT(rtDeviceReset(0)));
 }
@@ -400,6 +399,7 @@ Status ModelManager::Unload(uint32_t model_id) {
   }
   std::lock_guard<std::mutex> lock(exeception_infos_mutex_);
   exception_infos_.clear();
+  cust_aicpu_so_.clear();
   return SUCCESS;
 }
 
diff --git a/src/ge/graph/load/new_model_manager/model_utils.cc b/src/ge/graph/load/new_model_manager/model_utils.cc
index 9cbb684f..2bb111f3 100644
--- a/src/ge/graph/load/new_model_manager/model_utils.cc
+++ b/src/ge/graph/load/new_model_manager/model_utils.cc
@@ -328,15 +328,14 @@ vector<void *> ModelUtils::GetInputDataAddrs(const RuntimeParam &model_param, Co
            op_desc->GetName().c_str(), v_memory_type.size(), inputs_size);
     return v_input_data_addr;
   }
-  for (size_t i = 0; i < inputs_size; ++i) {
+  for (size_t i = 0; i < op_desc->GetAllInputsSize(); ++i) {
+    const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(static_cast<uint32_t>(i));
+    if (tensor_desc == nullptr) {
+      GELOGD("Op: %s, Index: %zu, has no input", op_desc->GetName().c_str(), i);
+      continue;
+    }
     if ((i < v_is_input_const.size()) && v_is_input_const[i] && (op_type != NETOUTPUT)) {
       // TBE: add weights address to input
-      const GeTensorDescPtr tensor_desc = op_desc->MutableInputDesc(i);
-      if (tensor_desc == nullptr) {
-        GELOGW("Op: %s, Index: %zu, Tensor Desc is null", op_desc->GetName().c_str(), i);
-        continue;
-      }
-
       int64_t tensor_size = 0;
       GE_CHK_STATUS(TensorUtils::GetSize(*tensor_desc, tensor_size));
       if (tensor_size) {
diff --git a/src/ge/graph/passes/attach_stream_label_pass.cc b/src/ge/graph/passes/attach_stream_label_pass.cc
index b8065325..6b718418 100644
--- a/src/ge/graph/passes/attach_stream_label_pass.cc
+++ b/src/ge/graph/passes/attach_stream_label_pass.cc
@@ -89,16 +89,13 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) {
   nodes.push(node);
 
   static const std::set<std::string> end_type_set = {STREAMSWITCH, STREAMMERGE, MERGE};
-  bool merge_flag = false;
-  bool exit_flag = false;
-  bool net_output_flag = false;
   while (!nodes.empty()) {
     NodePtr cur_node = nodes.top();
     nodes.pop();
     if (visited.count(cur_node) > 0) {
       continue;
     }
-    if (AttachFlag(cur_node, stream_label, merge_flag, exit_flag, net_output_flag) != SUCCESS) {
+    if (AttachFlag(cur_node, stream_label) != SUCCESS) {
       GELOGE(FAILED, "Attach flag for node %s failed.", cur_node->GetName().c_str());
       return FAILED;
     }
@@ -122,12 +119,6 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) {
     GE_CHK_STATUS_RET(SetActiveLabelList(node, {stream_label}), "set active_label_list failed.");
   }
 
-  bool attach_flag = (merge_flag || exit_flag) && net_output_flag;
-  if (attach_flag) {
-    GELOGI("No need to keep on attaching label.");
-    return SUCCESS;
-  }
-
   for (const NodePtr &tmp_node : branch_nodes) {
     GELOGD("Attach label %s to node: %s.", stream_label.c_str(), tmp_node->GetName().c_str());
     GE_CHK_STATUS_RET(SetStreamLabel(tmp_node, stream_label), "Set stream label failed.");
@@ -140,13 +131,9 @@ Status AttachStreamLabelPass::UpdateCondBranch(const NodePtr &node) {
 /// @brief attach flag
 /// @param [in] node
 /// @param [out] stream_label
-/// @param [out] merge_flag
-/// @param [out] exit_flag
-/// @param [out] net_output_flag
 /// @return Status
 ///
-Status AttachStreamLabelPass::AttachFlag(const NodePtr &node, std::string &stream_label, bool &merge_flag,
-                                         bool &exit_flag, bool &net_output_flag) {
+Status AttachStreamLabelPass::AttachFlag(const NodePtr &node, std::string &stream_label) {
   const std::string &type = node->GetType();
   if (type == STREAMSWITCH) {
     if (node->GetInDataNodes().empty()) {
@@ -164,12 +151,8 @@ Status AttachStreamLabelPass::AttachFlag(const NodePtr &node, std::string &strea
   } else if (type == STREAMMERGE) {
     stream_label = node->GetName();
     GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed.");
-    merge_flag = true;
   } else if ((type == EXIT) || (type == REFEXIT)) {
     GE_CHK_STATUS_RET(SetStreamLabel(node, stream_label), "Set stream label failed.");
-    exit_flag = true;
-  } else if (type == NETOUTPUT) {
-    net_output_flag = true;
   }
 
   return SUCCESS;
diff --git a/src/ge/graph/passes/attach_stream_label_pass.h b/src/ge/graph/passes/attach_stream_label_pass.h
index 5820480d..28e828b5 100644
--- a/src/ge/graph/passes/attach_stream_label_pass.h
+++ b/src/ge/graph/passes/attach_stream_label_pass.h
@@ -50,13 +50,9 @@ class AttachStreamLabelPass : public GraphPass {
   /// @brief attach flag
   /// @param [in] node
   /// @param [out] stream_label
-  /// @param [out] merge_flag
-  /// @param [out] exit_flag
-  /// @param [out] net_output_flag
   /// @return Status
   ///
-  static Status AttachFlag(const NodePtr &node, std::string &stream_label, bool &merge_flag, bool &exit_flag,
-                           bool &net_output_flag);
+  static Status AttachFlag(const NodePtr &node, std::string &stream_label);
 
   ///
   /// @brief Update stream_label for loop_branch
diff --git a/src/ge/graph/passes/enter_pass.cc b/src/ge/graph/passes/enter_pass.cc
index 84621689..ad3d78fc 100644
--- a/src/ge/graph/passes/enter_pass.cc
+++ b/src/ge/graph/passes/enter_pass.cc
@@ -20,13 +20,14 @@
 #include "framework/common/debug/log.h"
 #include "graph/utils/graph_utils.h"
 
+namespace {
+const size_t kOutNodesNum = 1;
+}
+
 namespace ge {
 Status EnterPass::Run(NodePtr &node) {
   GELOGD("EnterPass running");
-  if (node == nullptr) {
-    GELOGE(PARAM_INVALID, "param [node] must not be null.");
-    return PARAM_INVALID;
-  }
+  GE_CHECK_NOTNULL(node);
 
   if ((node->GetType() != ENTER) && (node->GetType() != REFENTER)) {
     return SUCCESS;
@@ -38,18 +39,17 @@ Status EnterPass::Run(NodePtr &node) {
     return PARAM_INVALID;
   }
   NodePtr in_node = node->GetInDataNodes().at(0);
-  if (in_node == nullptr) {
-    GELOGE(PARAM_INVALID, "param [in_node] must not be null");
-    return PARAM_INVALID;
-  }
+  GE_CHECK_NOTNULL(in_node);
 
   if ((in_node->GetType() != CONSTANT) && (in_node->GetType() != CONSTANTOP)) {
     return SUCCESS;
   }
 
-  bool need_remove_flag =
-    in_node->GetInControlNodes().empty() && node->GetInControlNodes().empty() && node->GetOutDataNodes().empty();
-  if (need_remove_flag) {
+  bool need_remove_flag = in_node->GetInControlNodes().empty() && node->GetInControlNodes().empty();
+  if (!need_remove_flag) {
+    return SUCCESS;
+  }
+  if (node->GetOutDataNodes().empty()) {
     for (auto &out_ctrl_node : node->GetOutControlNodes()) {
       if (out_ctrl_node == nullptr) {
         continue;
@@ -60,9 +60,57 @@ Status EnterPass::Run(NodePtr &node) {
         return FAILED;
       }
     }
+  } else {
+    if (OptimizeEnter(node, in_node) != SUCCESS) {
+      GELOGE(FAILED, "Optimize enter node[%s] failed.", node->GetName().c_str());
+      return FAILED;
+    }
   }
 
   GELOGD("EnterPass success");
   return SUCCESS;
 }
+
+///
+/// @brief Bypass an Enter node: link data output 0 of in_node straight to the consumers of the Enter's
+///        data output 0, then remove the Enter node from its graph.
+/// @param [in] node  Enter node to bypass (Run only passes ENTER/REFENTER nodes)
+/// @param [in] in_node  node feeding data input 0 of node (Run only passes CONSTANT/CONSTANTOP nodes)
+/// @return SUCCESS when the rewrite is done or deliberately skipped, a failure status otherwise
+///
+Status EnterPass::OptimizeEnter(NodePtr &node, NodePtr &in_node) {
+  // The rewrite is skipped unless in_node has exactly kOutNodesNum out node(s) per GetOutAllNodes().
+  auto out_nodes_of_in_node = in_node->GetOutAllNodes();
+  if (out_nodes_of_in_node.size() != kOutNodesNum) {
+    return SUCCESS;
+  }
+
+  if (!node->GetOutControlNodes().empty()) {
+    return SUCCESS;
+  }
+
+  for (const auto &out_node : node->GetOutDataNodes()) {
+    GE_CHECK_NOTNULL(out_node);
+    if (out_node->GetType() == MERGE) {
+      return SUCCESS;
+    }
+  }
+
+  // Fetch the anchor once so the null check covers every later use of it.
+  const auto in_node_out_anchor = in_node->GetOutDataAnchor(0);
+  GE_CHECK_NOTNULL(in_node_out_anchor);
+  GE_CHK_STATUS_RET(in_node_out_anchor->Unlink(node->GetInDataAnchor(0)));
+  auto out_data_anchor = node->GetOutDataAnchor(0);
+  GE_CHECK_NOTNULL(out_data_anchor);
+  for (auto peer_in_data_anchor : out_data_anchor->GetPeerInDataAnchors()) {
+    GE_CHK_STATUS_RET(out_data_anchor->Unlink(peer_in_data_anchor));
+    GE_CHK_STATUS_RET(in_node_out_anchor->LinkTo(peer_in_data_anchor));
+  }
+
+  auto graph = node->GetOwnerComputeGraph();
+  GE_CHK_STATUS_RET(GraphUtils::RemoveNodeWithoutRelink(graph, node));
+  AddRePassNodesWithInOut(in_node);
+
+  return SUCCESS;
+}
 }  // namespace ge
diff --git a/src/ge/graph/passes/enter_pass.h b/src/ge/graph/passes/enter_pass.h
index 04ac62ee..73702c38 100644
--- a/src/ge/graph/passes/enter_pass.h
+++ b/src/ge/graph/passes/enter_pass.h
@@ -23,6 +23,9 @@ namespace ge {
 class EnterPass : public BaseNodePass {
  public:
   Status Run(NodePtr &node) override;
+
+ private:
+  Status OptimizeEnter(NodePtr &node, NodePtr &in_node);
 };
 }  // namespace ge
 #endif  // GE_GRAPH_PASSES_ENTER_PASS_H_
diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.cc b/src/ge/graph/preprocess/multi_batch_copy_graph.cc
index 331d9c31..336527fb 100644
--- a/src/ge/graph/preprocess/multi_batch_copy_graph.cc
+++ b/src/ge/graph/preprocess/multi_batch_copy_graph.cc
@@ -41,7 +41,6 @@
 #include "inc/pass_manager.h"
 #include "graph/common/local_context.h"
 
-using std::map;
 using std::set;
 using std::string;
 using std::vector;
@@ -266,24 +265,27 @@ Status MultiBatchGraphCopyer::Init() {
 }
 
 Status MultiBatchGraphCopyer::LabelStatus() {
-  map<string, vector<NodePtr>> frame_enters;
-  InitStatus(frame_enters);
-
+  for (const auto &data : origin_data_nodes_) {
+    auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape();
+    if (!IsAllDimsPositive(data_shape.GetDims())) {
+      origin_nodes_status_[data.get()] = kNodeInBatchBranch;
+    }
+  }
   bool changed = true;
   // If anyone of in node is kNodeInBatchBranch, it is also kNodeInBatchBranch
   while (changed) {
     changed = false;
     for (const auto &node : origin_all_nodes_) {
+      auto iter = origin_nodes_status_.find(node.get());
+      if (iter != origin_nodes_status_.end()) {
+        continue;
+      }
       for (auto &in_node : node->GetInAllNodes()) {
         bool is_in_batch = origin_nodes_status_.find(in_node.get()) != origin_nodes_status_.end() &&
                            origin_nodes_status_[in_node.get()] == kNodeInBatchBranch;
         if (is_in_batch) {
-          if (origin_nodes_status_.find(node.get()) == origin_nodes_status_.end() ||
-              origin_nodes_status_[node.get()] != kNodeInBatchBranch) {
-            origin_nodes_status_[node.get()] = kNodeInBatchBranch;
-            ResetEnterStatus(frame_enters, node);
-            changed = true;
-          }
+          origin_nodes_status_[node.get()] = kNodeInBatchBranch;
+          changed = true;
           break;
         }
       }
@@ -314,45 +316,6 @@ Status MultiBatchGraphCopyer::LabelStatus() {
   return SUCCESS;
 }
 
-void MultiBatchGraphCopyer::InitStatus(map<string, vector<NodePtr>> &frame_enters) {
-  for (const auto &node : origin_all_nodes_) {
-    if (node->GetType() != ENTER && node->GetType() != REFENTER) {
-      continue;
-    }
-    auto op_desc = node->GetOpDesc();
-    if (op_desc == nullptr) {
-      continue;
-    }
-    string frame_name;
-    if (AttrUtils::GetStr(op_desc, ENTER_ATTR_FRAME_NAME, frame_name)) {
-      frame_enters[frame_name].emplace_back(node);
-    }
-  }
-
-  for (const auto &data : origin_data_nodes_) {
-    auto data_shape = NodeUtils::GetOutputDesc(*data, kDataOutIndex).GetShape();
-    if (!IsAllDimsPositive(data_shape.GetDims())) {
-      origin_nodes_status_[data.get()] = kNodeInBatchBranch;
-    }
-  }
-}
-
-void MultiBatchGraphCopyer::ResetEnterStatus(map<string, vector<NodePtr>> &frame_enters, const NodePtr &node) {
-  if (node->GetType() != ENTER && node->GetType() != REFENTER) {
-    return;
-  }
-
-  for (const auto &frame_enter : frame_enters) {
-    auto &enters = frame_enter.second;
-    if (std::find(enters.begin(), enters.end(), node) != enters.end()) {
-      for (const auto &enter : enters) {
-        origin_nodes_status_[enter.get()] = kNodeInBatchBranch;
-      }
-      break;
-    }
-  }
-}
-
 Status MultiBatchGraphCopyer::CreateNewNodes() {
   shape_data_ = InsertShapeDataNode();
   if (shape_data_ == nullptr) {
@@ -1200,7 +1163,7 @@ void GetDynamicShapeByMerge(const ComputeGraphPtr &graph, const NodePtr &node, s
   }
 }
 
-// Connect NetOutput directly: DTS2020070612498
+// Connect NetOutput directly
 void GetDirectOutputShape(const ComputeGraphPtr &graph, const NodePtr &node, const set<size_t> &dynamic_output_index,
                           vector<string> &dynamic_output_dims) {
   GELOGD("Try get directly shape info, Graph: %s, Node: %s", graph->GetName().c_str(), node->GetName().c_str());
diff --git a/src/ge/graph/preprocess/multi_batch_copy_graph.h b/src/ge/graph/preprocess/multi_batch_copy_graph.h
index f665b65e..062b98d2 100644
--- a/src/ge/graph/preprocess/multi_batch_copy_graph.h
+++ b/src/ge/graph/preprocess/multi_batch_copy_graph.h
@@ -68,8 +68,6 @@ class MultiBatchGraphCopyer {
 
   // label status for origin_all_nodes_
   Status LabelStatus();
-  void InitStatus(std::map<string, vector<NodePtr>> &frame_enters);
-  void ResetEnterStatus(std::map<string, vector<NodePtr>> &frame_enters, const NodePtr &node);
   // add nodes functions
   Status CreateNewNodes();
 
diff --git a/src/ge/host_cpu_engine/module.mk b/src/ge/host_cpu_engine/module.mk
index 41de4503..e35c68c9 100644
--- a/src/ge/host_cpu_engine/module.mk
+++ b/src/ge/host_cpu_engine/module.mk
@@ -40,7 +40,7 @@ include ${BUILD_HOST_SHARED_LIBRARY}
 include $(CLEAR_VARS)
 LOCAL_MODULE := atclib/libhost_cpu_engine
 LOCAL_CFLAGS += -Werror
-LOCAL_CFLAGS += -std=c++11
+LOCAL_CFLAGS += -std=c++11 -DCOMPILE_OMG_PACKAGE
 LOCAL_LDFLAGS :=
 
 LOCAL_STATIC_LIBRARIES :=
diff --git a/src/ge/init/gelib.cc b/src/ge/init/gelib.cc
index ec56cc0a..e00268ea 100644
--- a/src/ge/init/gelib.cc
+++ b/src/ge/init/gelib.cc
@@ -165,8 +165,10 @@ Status GELib::SystemInitialize(const map<string, string> &options) {
     }
   }
 
-  // In train and infer, profiling is always needed.
   InitOptions(options);
+
+  // In train and infer, profiling is always needed.
+  InitProfiling(this->options_);
   auto model_manager = ModelManager::GetInstance();
   GE_CHECK_NOTNULL(model_manager);
   GE_IF_BOOL_EXEC(model_manager->EnableExceptionDump(options) != SUCCESS,
@@ -176,21 +178,19 @@ Status GELib::SystemInitialize(const map<string, string> &options) {
   // 2.`(!is_train_mode_) && (options_.device_id != kDefaultDeviceIdForInfer)` means case: online infer
   // these two case with logical device id
   if (is_train_mode_ || (options_.device_id != kDefaultDeviceIdForInfer)) {
-    InitProfiling(this->options_, true);
     status = InitSystemWithOptions(this->options_);
   } else {
-    InitProfiling(this->options_);
     status = InitSystemWithoutOptions();
   }
   return status;
 }
 
-void GELib::InitProfiling(Options &options, bool convert_2_phy_device_id) {
+void GELib::InitProfiling(Options &options) {
   GELOGI("Init Profiling. session Id: %ld, device id:%d ", options.session_id, options.device_id);
   std::lock_guard<std::mutex> lock(status_mutex_);
   GetContext().Init();
   // Profiling init
-  if (ProfilingManager::Instance().Init(options, convert_2_phy_device_id) != SUCCESS) {
+  if (ProfilingManager::Instance().Init(options) != SUCCESS) {
     GELOGW("Profiling init failed.");
   }
 }
diff --git a/src/ge/init/gelib.h b/src/ge/init/gelib.h
index c8b3ff8a..b5621dfd 100644
--- a/src/ge/init/gelib.h
+++ b/src/ge/init/gelib.h
@@ -68,7 +68,7 @@ class GELib {
   // get incre build cache path
   const std::string &GetIncreBuildCachePath() const { return incre_build_cache_path_; }
 
-  void InitProfiling(Options &options, bool convert_2_phy_device_id = false);
+  void InitProfiling(Options &options);
   void ShutDownProfiling();
 
   Status InitSystemWithoutOptions();
diff --git a/src/ge/ir_build/atc_ir_common.cc b/src/ge/ir_build/atc_ir_common.cc
index 82ed40bd..1f8abf37 100644
--- a/src/ge/ir_build/atc_ir_common.cc
+++ b/src/ge/ir_build/atc_ir_common.cc
@@ -522,7 +522,7 @@ void PrintOptionMap(std::map<std::string, std::string> &options, std::string tip
   for (auto iter = options.begin(); iter != options.end(); iter++) {
     std::string key = iter->first;
     std::string option_name = iter->second;
-    GELOGI("%s set successfully, key=%s, value=%s", tips.c_str(), key.c_str(), option_name.c_str());
+    GELOGI("%s set successfully, option_key=%s, option_value=%s", tips.c_str(), key.c_str(), option_name.c_str());
   }
 }
 
diff --git a/src/ge/ir_build/ge_ir_build.cc b/src/ge/ir_build/ge_ir_build.cc
index 90f7a8ca..86b304c1 100644
--- a/src/ge/ir_build/ge_ir_build.cc
+++ b/src/ge/ir_build/ge_ir_build.cc
@@ -96,6 +96,12 @@ static graphStatus CheckGlobalOptions(std::map<std::string, std::string> &global
                    return ge::GRAPH_PARAM_INVALID, "check optypelist_for_implmode and op_select_implmode failed!");
   global_options[ge::ir_option::OP_SELECT_IMPL_MODE] = op_select_implmode;
 
+  // set precision mode default value
+  std::string precision_mode = global_options.find(ge::ir_option::PRECISION_MODE) == global_options.end()
+                                 ? "force_fp16"
+                                 : global_options[ge::ir_option::PRECISION_MODE];
+  global_options[ge::ir_option::PRECISION_MODE] = precision_mode;
+
   return GRAPH_SUCCESS;
 }
 
diff --git a/src/ge/opskernel_manager/ops_kernel_manager.cc b/src/ge/opskernel_manager/ops_kernel_manager.cc
index 51e8f438..11eb3061 100644
--- a/src/ge/opskernel_manager/ops_kernel_manager.cc
+++ b/src/ge/opskernel_manager/ops_kernel_manager.cc
@@ -175,25 +175,25 @@ Status OpsKernelManager::ParsePluginOptions(const map<string, string> &options,
       } else if (flag == 1) {
         enable_flag = true;
       } else {
-        GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:%s, its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(),
-               iter->second.c_str());
+        GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.",
+               plugin_name.c_str(), iter->second.c_str());
         return GE_GRAPH_OPTIONS_INVALID;
       }
     } catch (std::invalid_argument &) {
-      GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.feFlag, its value %s is invalid_argument, it must be 0 or 1.",
+      GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:ge.feFlag, its value %s is invalid_argument, it must be 0 or 1.",
              iter->second.c_str());
       return GE_GRAPH_OPTIONS_INVALID;
     } catch (std::out_of_range &) {
-      GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:ge.feFlag, its value %s is out of range, it must be 0 or 1.",
+      GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:ge.feFlag, its value %s is out of range, it must be 0 or 1.",
              iter->second.c_str());
       return GE_GRAPH_OPTIONS_INVALID;
     } catch (...) {
-      GELOGE(GE_GRAPH_OPTIONS_INVALID, "Key:%s, its value %s is invalid, it must be 0 or 1.", plugin_name.c_str(),
-             iter->second.c_str());
+      GELOGE(GE_GRAPH_OPTIONS_INVALID, "option_key:%s, its value %s is invalid, it must be 0 or 1.",
+             plugin_name.c_str(), iter->second.c_str());
       return GE_GRAPH_OPTIONS_INVALID;
     }
   } else {
-    GELOGI("Not find key %s, set to default value false.", plugin_name.c_str());
+    GELOGI("Not find option_key %s, set to default value false.", plugin_name.c_str());
     enable_flag = false;
   }
 
diff --git a/src/ge/session/omg.cc b/src/ge/session/omg.cc
index bcf42032..0fb342e1 100644
--- a/src/ge/session/omg.cc
+++ b/src/ge/session/omg.cc
@@ -618,11 +618,16 @@ Status ParseOutNodes(const string &out_nodes) {
     if (!out_nodes.empty()) {
       domi::GetContext().out_nodes_map.clear();
       domi::GetContext().user_out_nodes.clear();
+      domi::GetContext().user_out_nodes_top_vec.clear();
 
       vector<string> nodes_v = StringUtils::Split(out_nodes, ';');
       for (const string &node : nodes_v) {
         vector<string> key_value_v = StringUtils::Split(node, ':');
         if (key_value_v.size() != 2) {  // The size must be 2.
+          if (key_value_v.size() == 1 && domi::GetContext().type == domi::CAFFE) {
+            domi::GetContext().user_out_nodes_top_vec.push_back(node);
+            continue;
+          }
           ErrorManager::GetInstance().ATCReportErrMessage(
             "E10001", {"parameter", "value", "reason"},
             {"--out_nodes", node, "the correct format is \"node_name1:0;node_name1:1;node_name2:0\""});
@@ -632,7 +637,13 @@ Status ParseOutNodes(const string &out_nodes) {
                  node.c_str());
           return PARAM_INVALID;
         }
-        auto iter = domi::GetContext().out_nodes_map.find(key_value_v[0]);
+        if (!domi::GetContext().user_out_nodes_top_vec.empty()) {
+          ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"},
+                                                          {"--out_nodes", out_nodes, "is not all index or top_name"});
+          GELOGE(PARAM_INVALID, "This out_nodes str must be all index or top_name, while the actual input is %s",
+                 out_nodes.c_str());
+          return PARAM_INVALID;
+        }
         // stoi: The method may throw an exception: invalid_argument/out_of_range
         if (!CheckDigitStr(key_value_v[1])) {
           ErrorManager::GetInstance().ATCReportErrMessage("E10001", {"parameter", "value", "reason"},
@@ -640,7 +651,10 @@ Status ParseOutNodes(const string &out_nodes) {
           GELOGE(PARAM_INVALID, "This str must be digit string, while the actual input is %s", out_nodes.c_str());
           return PARAM_INVALID;
         }
+
+        auto iter = domi::GetContext().out_nodes_map.find(key_value_v[0]);
         int32_t index = stoi(StringUtils::Trim(key_value_v[1]));
+        GELOGD("Get output info: node[%s] and index[%d]", key_value_v[0].c_str(), index);
         if (iter != domi::GetContext().out_nodes_map.end()) {
           iter->second.emplace_back(index);
         } else {
diff --git a/src/ge/single_op/single_op.cc b/src/ge/single_op/single_op.cc
index 8e68208d..f59fb7bd 100644
--- a/src/ge/single_op/single_op.cc
+++ b/src/ge/single_op/single_op.cc
@@ -279,7 +279,7 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, con
   if (op_task_->GetOpTaskType() == OP_TASK_TBE) {
     return ExecuteTbeTask(input_desc, inputs, output_desc, outputs);
   } else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) {
-    return op_task_->LaunchKernel(input_desc, inputs, output_desc, outputs, stream_);
+    return op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_);
   } else {
     GELOGE(UNSUPPORTED, "Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u",
            op_task_->GetOpTaskType());
diff --git a/src/ge/single_op/task/build_task_utils.cc b/src/ge/single_op/task/build_task_utils.cc
index 9e97ee57..268cbfd1 100644
--- a/src/ge/single_op/task/build_task_utils.cc
+++ b/src/ge/single_op/task/build_task_utils.cc
@@ -75,8 +75,11 @@ std::string BuildTaskUtils::GetTaskInfo(const OpDescPtr &op_desc) {
     // Conv2D IN[DT_FLOAT16 NC1HWC0[256, 128, 7, 7, 16],DT_FLOAT16 FRACTAL_Z[128, 32, 16, 16]]
     // OUT[DT_FLOAT16 NC1HWC0[256, 32, 7, 7, 16]]
     ss << op_type << " IN[";
-    for (uint32_t idx = 0; idx < op_desc->GetInputsSize(); idx++) {
+    for (uint32_t idx = 0; idx < op_desc->GetAllInputsSize(); idx++) {
       const GeTensorDescPtr &input = op_desc->MutableInputDesc(idx);
+      if (input == nullptr) {
+        continue;
+      }
       ss << TypeUtils::DataTypeToSerialString(input->GetDataType()) << " ";
       ss << TypeUtils::FormatToSerialString(input->GetFormat());
       ss << VectorToString(input->GetShape().GetDims());
diff --git a/src/ge/single_op/task/op_task.cc b/src/ge/single_op/task/op_task.cc
index 0c489aa4..78db835e 100644
--- a/src/ge/single_op/task/op_task.cc
+++ b/src/ge/single_op/task/op_task.cc
@@ -34,6 +34,11 @@ constexpr int kLaunchRetryTimes = 1000;
 constexpr int kSleepTime = 10;
 constexpr uint64_t kReleaseFlag = 1;
 constexpr int kCopyNum = 2;
+void FreeHbm(void *var) {  // Frees var through rtFree; a null pointer is skipped.
+  if (var) {
+    (void)rtFree(var);  // rtFree's return status is intentionally discarded.
+  }
+}
 }  // namespace
 
 Status OpTask::OpenDump(const std::vector<uintptr_t> &io_addr, rtStream_t stream) {
@@ -336,49 +341,23 @@ Status AiCpuBaseTask::UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensor
 }
 
 AiCpuTask::~AiCpuTask() {
-  if (args_ != nullptr) {
-    (void)rtFree(args_);
-  }
-
-  if (io_addr_ != nullptr) {
-    (void)rtFree(io_addr_);
-  }
-
-  if (dynamic_flag_ && workspace_addr_ != nullptr) {
-    (void)rtFree(workspace_addr_);
-  }
-  if (copy_workspace_buf_ != nullptr) {
-    (void)rtFree(copy_workspace_buf_);
-  }
-
-  if (copy_ioaddr_dev_ != nullptr) {
-    (void)rtFree(copy_ioaddr_dev_);
-  }
-
-  if (copy_input_release_flag_dev_ != nullptr) {
-    (void)rtFree(copy_input_release_flag_dev_);
-  }
-
-  if (copy_input_data_size_dev_ != nullptr) {
-    (void)rtFree(copy_input_data_size_dev_);
-  }
-
-  if (copy_input_src_dev_ != nullptr) {
-    (void)rtFree(copy_input_src_dev_);
-  }
-
-  if (copy_input_dst_dev_ != nullptr) {
-    (void)rtFree(copy_input_dst_dev_);
-  }
-
-  if (copy_task_args_buf_ != nullptr) {
-    (void)rtFree(copy_task_args_buf_);
-  }
-
+  FreeHbm(args_);
+  FreeHbm(io_addr_);
+  if (dynamic_flag_) {
+    FreeHbm(workspace_addr_);
+  }
+  FreeHbm(copy_workspace_buf_);
+  FreeHbm(copy_ioaddr_dev_);
+  FreeHbm(copy_input_release_flag_dev_);
+  FreeHbm(copy_input_data_size_dev_);
+  FreeHbm(copy_input_src_dev_);
+  FreeHbm(copy_input_dst_dev_);
+  FreeHbm(copy_task_args_buf_);
   for (auto summary : output_summary_) {
-    if (summary != nullptr) {
-      (void)rtFree(summary);
-    }
+    FreeHbm(summary);
+  }
+  for (auto out_shape : out_shape_hbm_) {
+    FreeHbm(out_shape);
   }
 }
 
@@ -405,7 +384,7 @@ Status AiCpuTask::LaunchKernel(rtStream_t stream) {
   return SUCCESS;
 }
 
-Status AiCpuTask::PrepareCopyInputs(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm) {
+Status AiCpuTask::PrepareCopyInputs(vector<DataBuffer> &outputs) {
   std::vector<uint64_t> copy_input_release_flag;
   std::vector<uint64_t> copy_input_data_size;
   std::vector<uint64_t> copy_input_src;
@@ -417,11 +396,15 @@ Status AiCpuTask::PrepareCopyInputs(vector<void *> &outputs, const std::vector<v
            summary.shape_data_ptr, summary.shape_data_size, summary.raw_data_ptr, summary.raw_data_size);
     auto output = outputs[i];
     copy_input_release_flag.emplace_back(kReleaseFlag);
-    copy_input_data_size.emplace_back(summary.raw_data_size);
+    if (summary.raw_data_size > 0) {
+      copy_input_data_size.emplace_back(output.length);
+    } else {
+      copy_input_data_size.emplace_back(summary.raw_data_size);
+    }
     copy_input_src.emplace_back(summary.raw_data_ptr);
-    copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(output));
+    copy_input_dst.emplace_back(reinterpret_cast<uintptr_t>(output.data));
 
-    const auto &shape_buffer = out_shape_hbm[i];
+    const auto &shape_buffer = out_shape_hbm_[i];
     copy_input_release_flag.emplace_back(kReleaseFlag);
     copy_input_data_size.emplace_back(summary.shape_data_size);
     copy_input_src.emplace_back(summary.shape_data_ptr);
@@ -441,7 +424,7 @@ Status AiCpuTask::PrepareCopyInputs(vector<void *> &outputs, const std::vector<v
   return SUCCESS;
 }
 
-Status AiCpuTask::ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_shape_hbm) {
+Status AiCpuTask::ReadResultSummaryAndPrepareMemory() {
   for (size_t i = 0; i < num_outputs_; ++i) {
     auto &result_summary = output_summary_host_[i];
 
@@ -449,36 +432,39 @@ Status AiCpuTask::ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_sha
                            sizeof(aicpu::FWKAdapter::ResultSummary), RT_MEMCPY_DEVICE_TO_HOST));
     auto shape_data_size = result_summary.shape_data_size;
     void *shape_buffer = nullptr;
-    GE_MAKE_GUARD_RTMEM(shape_buffer);
-    GE_CHK_RT_RET(rtMalloc(&shape_buffer, shape_data_size, RT_MEMORY_HBM));
-    out_shape_hbm.emplace_back(shape_buffer);
+    if (shape_data_size > 0) {
+      GE_CHK_RT_RET(rtMalloc(&shape_buffer, shape_data_size, RT_MEMORY_HBM));
+    }
+    out_shape_hbm_.emplace_back(shape_buffer);
   }
   return SUCCESS;
 }
 
-Status AiCpuTask::CopyDataToHbm(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm, rtStream_t stream) {
-  GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs, out_shape_hbm));
+Status AiCpuTask::CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream) {
+  GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs));
 
   GE_CHK_RT_RET(rtKernelLaunchEx(copy_task_args_buf_, sizeof(STR_FWK_OP_KERNEL), RT_KERNEL_DEFAULT, stream));
   GE_CHK_RT_RET(rtStreamSynchronize(stream));
   return SUCCESS;
 }
 
-Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc, const std::vector<void *> &out_shape_hbm) {
+Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) {
   for (size_t i = 0; i < num_outputs_; ++i) {
     const auto &result_summary = output_summary_host_[i];
     std::vector<int64_t> shape_dims;
-    const auto &shape_hbm = out_shape_hbm[i];
-
-    uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
-    std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
-    GE_CHECK_NOTNULL(shape_addr);
-    GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm, result_summary.shape_data_size,
-                           RT_MEMCPY_DEVICE_TO_HOST));
-
-    for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
-      shape_dims.emplace_back(shape_addr[dim_idx]);
-      GELOGD("Node [%zu]th output dim[%u]=%ld.", i, dim_idx, shape_addr[dim_idx]);
+    if (result_summary.shape_data_size > 0) {
+      const auto &shape_hbm = out_shape_hbm_[i];
+
+      uint32_t dim_num = result_summary.shape_data_size / sizeof(int64_t);
+      std::unique_ptr<int64_t[]> shape_addr(new (std::nothrow) int64_t[dim_num]());
+      GE_CHECK_NOTNULL(shape_addr);
+      GE_CHK_RT_RET(rtMemcpy(shape_addr.get(), result_summary.shape_data_size, shape_hbm,
+                             result_summary.shape_data_size, RT_MEMCPY_DEVICE_TO_HOST));
+
+      for (uint32_t dim_idx = 0; dim_idx < dim_num; ++dim_idx) {
+        shape_dims.emplace_back(shape_addr[dim_idx]);
+        GELOGD("Node [%zu]th output dim[%u]=%ld.", i, dim_idx, shape_addr[dim_idx]);
+      }
     }
 
     GE_CHK_STATUS_RET(UpdateShapeToOutputDesc(GeShape(shape_dims), output_desc[i]),
@@ -487,7 +473,7 @@ Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc, cons
   return SUCCESS;
 }
 
-Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<void *> &outputs,
+Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<DataBuffer> &outputs,
                                                     rtStream_t stream) {
   if (num_outputs_ == 0) {
     GELOGI("Output num is 0, there is no need to update the output and size.");
@@ -496,13 +482,20 @@ Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output
 
   GELOGI("Update shape and data by result summary begin.");
 
-  std::vector<void *> out_shape_hbm;
-  GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(out_shape_hbm),
-                    "Read ResultSummary and update output shape failed.");
+  for (auto out_shape : out_shape_hbm_) {
+    FreeHbm(out_shape);
+  }
+  out_shape_hbm_.clear();
+  GE_CHK_STATUS_RET(ReadResultSummaryAndPrepareMemory(), "Read ResultSummary and update output shape failed.");
+
+  GE_CHK_STATUS_RET(CopyDataToHbm(outputs, stream), "Copy data to output failed.");
 
-  GE_CHK_STATUS_RET(CopyDataToHbm(outputs, out_shape_hbm, stream), "Copy data to output failed.");
+  GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc), "Update shape by hbm buffer failed.");
 
-  GE_CHK_STATUS_RET(UpdateShapeByHbmBuffer(output_desc, out_shape_hbm), "Update shape by hbm buffer failed.");
+  for (auto out_shape : out_shape_hbm_) {
+    FreeHbm(out_shape);
+  }
+  out_shape_hbm_.clear();
 
   GELOGI("Update shape and data by result summary end.");
   return SUCCESS;
@@ -603,10 +596,18 @@ Status AiCpuTask::SetMemCopyTask(const domi::KernelExDef &kernel_def) {
   return SUCCESS;
 }
 
-Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
-                               std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs,
-                               rtStream_t stream) {
+Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
+                               const std::vector<DataBuffer> &input_buffers, std::vector<GeTensorDesc> &output_desc,
+                               std::vector<DataBuffer> &output_buffers, rtStream_t stream) {
   GE_CHK_STATUS_RET_NOLOG(UpdateExtInfo(input_desc, output_desc));
+  std::vector<void *> inputs;
+  std::vector<void *> outputs;
+  for (auto &buffer : input_buffers) {
+    inputs.emplace_back(buffer.data);
+  }
+  for (auto &buffer : output_buffers) {
+    outputs.emplace_back(buffer.data);
+  }
   GE_CHK_STATUS_RET_NOLOG(SetIO(inputs, outputs));
   GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
   GE_CHK_RT_RET(rtStreamSynchronize(stream));
@@ -614,7 +615,7 @@ Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, cons
   if (unknown_type_ == DEPEND_SHAPE_RANGE) {
     GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc));
   } else if (unknown_type_ == DEPEND_COMPUTE) {
-    GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, outputs, stream));
+    GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, output_buffers, stream));
   }
 
   return SUCCESS;
@@ -658,9 +659,9 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
   return SUCCESS;
 }
 
-Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
-                                 std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs,
-                                 rtStream_t stream) {
+Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
+                                 const std::vector<DataBuffer> &input_buffers, std::vector<GeTensorDesc> &output_desc,
+                                 std::vector<DataBuffer> &output_buffers, rtStream_t stream) {
   GE_CHK_BOOL_RET_STATUS(unknown_type_ != DEPEND_COMPUTE, FAILED,
                          "AiCpuCCTask unknown type[%d] is depend compute, it's not supported now.", unknown_type_);
 
@@ -669,11 +670,11 @@ Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, co
   size_t arg_index = 0;
   auto *task_io_addr = reinterpret_cast<uintptr_t *>(io_addr_);
   GE_CHECK_NOTNULL(task_io_addr);
-  for (auto &input : inputs) {
-    task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(input);
+  for (auto &input : input_buffers) {
+    task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(input.data);
   }
-  for (auto &output : outputs) {
-    task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(output);
+  for (auto &output : output_buffers) {
+    task_io_addr[arg_index++] = reinterpret_cast<uintptr_t>(output.data);
   }
 
   GE_CHK_STATUS_RET_NOLOG(LaunchKernel(stream));
diff --git a/src/ge/single_op/task/op_task.h b/src/ge/single_op/task/op_task.h
index b6ea9114..5f742197 100644
--- a/src/ge/single_op/task/op_task.h
+++ b/src/ge/single_op/task/op_task.h
@@ -57,8 +57,9 @@ class OpTask {
   void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes);
   const OpDescPtr &GetOpdesc() const { return op_desc_; }
   Status OpenDump(const std::vector<uintptr_t> &io_addr, rtStream_t stream);
-  virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
-                              std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) {
+  virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &input_buffers,
+                              std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &output_buffers,
+                              rtStream_t stream) {
     return UNSUPPORTED;
   }
 
@@ -138,8 +139,9 @@ class AiCpuTask : public AiCpuBaseTask {
   OpTaskType GetOpTaskType() override { return OP_TASK_AICPU; }
   const void *GetIOAddr() const override;
 
-  Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
-                      std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) override;
+  Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &input_buffers,
+                      std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &output_buffers,
+                      rtStream_t stream) override;
   Status SetMemCopyTask(const domi::KernelExDef &kernel_def);
 
  private:
@@ -147,14 +149,14 @@ class AiCpuTask : public AiCpuBaseTask {
 
   // for copy task.
   Status InitForSummaryAndCopy();
-  Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<void *> &outputs,
+  Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, vector<DataBuffer> &outputs,
                                            rtStream_t stream);
-  Status ReadResultSummaryAndPrepareMemory(std::vector<void *> &out_shape_hbm);
+  Status ReadResultSummaryAndPrepareMemory();
 
-  Status CopyDataToHbm(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm, rtStream_t stream);
-  Status PrepareCopyInputs(vector<void *> &outputs, const std::vector<void *> &out_shape_hbm);
+  Status CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream);
+  Status PrepareCopyInputs(vector<DataBuffer> &outputs);
 
-  Status UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc, const std::vector<void *> &out_shape_hbm);
+  Status UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc);
 
   friend class AiCpuTaskBuilder;
   void *workspace_addr_ = nullptr;
@@ -178,6 +180,8 @@ class AiCpuTask : public AiCpuBaseTask {
   void *copy_input_data_size_dev_;
   void *copy_input_src_dev_;
   void *copy_input_dst_dev_;
+
+  vector<void *> out_shape_hbm_;
 };
 
 class AiCpuCCTask : public AiCpuBaseTask {
@@ -197,8 +201,9 @@ class AiCpuCCTask : public AiCpuBaseTask {
   void SetIoAddr(void *io_addr);
   size_t GetArgSize() const;
 
-  Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<void *> &inputs,
-                      std::vector<GeTensorDesc> &output_desc, std::vector<void *> &outputs, rtStream_t stream) override;
+  Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, const std::vector<DataBuffer> &input_buffers,
+                      std::vector<GeTensorDesc> &output_desc, std::vector<DataBuffer> &output_buffers,
+                      rtStream_t stream) override;
 
  private:
   friend class AiCpuCCTaskBuilder;
diff --git a/third_party/fwkacllib/inc/ops/aipp.h b/third_party/fwkacllib/inc/ops/aipp.h
index 0c1d5112..dd01ac5f 100644
--- a/third_party/fwkacllib/inc/ops/aipp.h
+++ b/third_party/fwkacllib/inc/ops/aipp.h
@@ -25,16 +25,21 @@
 
 namespace ge {
 /**
-*@brief Performs AI pre-processing (AIPP) on images including color space conversion (CSC),
-image normalization (by subtracting the mean value or multiplying a factor), image cropping
-(by specifying the crop start and cropping the image to the size required by the neural network), and much more. \n
+*@brief Performs AI pre-processing (AIPP) on images, including:
+color space conversion (CSC),
+image normalization (by subtracting the mean value or multiplying by a factor),
+image cropping (by specifying the crop start and cropping the image to the
+size required by the neural network),
+and much more. \n
 
 *@par Inputs:
-*@li images: An NCHW or NHWC tensor of type uint8, specifying the input to the data layer.
+*@li images: An NCHW or NHWC tensor of type uint8, specifying the input to the 
+data layer.
 *@li params: Dynamic AIPP configuration parameters of type uint8. \n
 
 *@par Attributes:
-*aipp_config_path: A required string, specifying the path of the AIPP configuration file. \n
+*aipp_config_path: A required string, specifying the path of the AIPP 
+configuration file. \n
 
 *@par Outputs:
 *features: The AIPP-processed output tensor of type float16 or uint8.
diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
index 5d68b977..6d865399 100644
--- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
+++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
@@ -28,9 +28,10 @@ namespace ge {
 
 *@par Inputs:
 *Dynamic inputs, including:
-* @li x: A list of Tensor objects, each with same shape and type. The supported types are:
+* @li x: A list of Tensor objects, each with same shape and type. The supported 
+types are:
 *   float16, float32, double, int32, uint8, int16, int8, complex64, int64,
-*   qint8, quint8, qint32, uint16, complex128, uint32, uint64. It's a dynamic input. \n
+*   qint8, quint8, qint32, uint16, complex128, uint32, uint64. \n
 
 *@par Outputs:
 *y: A Tensor. Has the same shape and type as the elements of "x". \n
@@ -121,7 +122,8 @@ REG_OP(MinimumGrad)
 
 *@par Inputs:
 *One input:
-*x:A Tensor. Must be one of the following types: bool, float16, float, int8, int32, uint32, uint8,
+*x:A Tensor. Must be one of the following types: bool, float16, float, int8, 
+int32, uint32, uint8,
    int64, uint64, int16, uint16, double, complex64, complex128, qint8, quint8, qint16, quint16, qint32. \n
 
 *@par Attributes:
@@ -385,7 +387,8 @@ REG_OP(Sign)
 
 *@par Inputs:
 *Two inputs, including: \n
-*@li x1: A Tensor. Must be one of the following types: float16, float32, float64, int32, int64, complex64,complex128
+*@li x1: A Tensor. Must be one of the following types: float16, float32,
+float64, int32, int64, complex64, complex128
 *@li x2: A Tensor. Has the same type as "x1". \n
 
 *@par Outputs:
@@ -484,12 +487,16 @@ REG_OP(Equal)
 
 *@par Inputs:
 *One input:\n
-*x: A Tensor. Must be one of the following types: float16, float32, double, complex64, complex128. \n
+*x: A Tensor. Must be one of the following types: float16, float32, double, 
+complex64, complex128. \n
 
 *@par Attributes:
-*@li base: An optional attribute of type float32, specifying the base gamma. Defaults to "-1.0".
-*@li scale: An optional attribute of type float32, specifying the scale alpha. Defaults to "1.0".
-*@li shift: An optional attribute of type float32, specifying the shift beta. Defaults to "0.0". \n
+*@li base: An optional attribute of type float32, specifying the base gamma. 
+Defaults to "-1.0".
+*@li scale: An optional attribute of type float32, specifying the scale alpha. 
+Defaults to "1.0".
+*@li shift: An optional attribute of type float32, specifying the shift beta. 
+Defaults to "0.0". \n
 
 *@par Outputs:
 *y: A Tensor of the same type as "x". \n
@@ -510,7 +517,8 @@ REG_OP(Exp)
 
 *@par Inputs:
 *One input:
-*x: A Tensor. Must be one of the following types: float16, float32, double, complex64, complex128. \n
+*x: A Tensor. Must be one of the following types: float16, float32, double, 
+complex64, complex128. \n
 
 *@par Outputs:
 *y: A Tensor of the same type as "x". \n
@@ -527,7 +535,9 @@ REG_OP(Expm1)
 *@brief: Computes the reciprocal of "x". \n
 
 *@par Inputs:\n
-*x: A Tensor. Must be one of the following types: float16, float32, int32, int64, double, complex64, complex128. \n
+*x: A Tensor. Must be one of the following types: float16, float32,
+int32, int64, double,
+complex64, complex128. \n
 
 *@par Outputs:
 *y: A Tensor. Has the same type as "x". \n
@@ -749,7 +759,8 @@ REG_OP(Xlogy)
 
 *@par Inputs:
 *One input: \n
-*x: A Tensor. Must be one of the following types: float16, float32, float64, int32, int64, complex64, complex128
+*x: A Tensor. Must be one of the following types: float16, float32, float64,
+int32, int64, complex64, complex128
 
 *@par Outputs:
 *y: A Tensor. Has the same type as "x". \n
@@ -790,7 +801,8 @@ REG_OP(Rsqrt)
 
 *
 *@par Inputs:
-* x: A tensor. Must be one of the following types: float16, float32, float64, int32, int64, complex64, complex128.
+* x: A tensor. Must be one of the following types: float16, float32, float64, 
+int32, int64, complex64, complex128.
 *
 *@par Outputs:
 * y: A tensor. Has the same type as "x".
@@ -811,7 +823,8 @@ REG_OP(Asin)
 
 *
 *@par Inputs:
-*@li y: A tensor of type float16, float32, float64, int32, int64, complex64, complex128.
+*@li y: A tensor of type float16, float32, float64, 
+int32, int64, complex64, complex128.
 *@li dy: A tensor of the same type as "y".
 *
 *@attention Constraints:
@@ -838,7 +851,8 @@ REG_OP(AsinGrad)
 
 *
 *@par Inputs:
-* x: A tensor. Must be one of the following types: float16, float32, float64, int32, int64, complex64, complex128.
+* x: A tensor. Must be one of the following types: float16, float32, float64,
+int32, int64, complex64, complex128.
 *
 *@par Outputs:
 * y: A tensor. Has the same type as "x".
@@ -883,7 +897,8 @@ REG_OP(AcosGrad)
 
 *
 *@par Inputs:
-* x: A tensor. Must be one of the following types: float16, float32, float64, complex64, complex128.
+* x: A tensor. Must be one of the following types: float16, float32, float64,
+ complex64, complex128.
 *
 *@attention Constraints:
 * x Given an input tensor, the function computes inverse hyperbolic cosine of every element.\n
@@ -1160,7 +1175,8 @@ REG_OP(FusedMulAdd)
 
 *
 *@par Inputs:
-*@li x1: A tensor. Must be one of the following types: float16, float32, float64, uint8, int8, int16, int32, int64, complex64, complex128.
+*@li x1: A tensor. Must be one of the following types: float16, float32, float64,
+uint8, int8, int16, int32, int64, complex64, complex128.
 *@li x2: A tensor of the same type as "x1".
 *
 *@attention Constraints:
@@ -1189,7 +1205,8 @@ REG_OP(AddV2)
 *@brief Updates "ref" by adding "value" to it. \n
 
 *@par Inputs:
-*@li ref: A Tensor. Must be one of the following types: float16, float32, int8, int16, int32, int64, uint8, uint16, uint32, uint64.
+*@li ref: A Tensor. Must be one of the following types: float16, float32, int8,
+int16, int32, int64, uint8, uint16, uint32, uint64.
 *@li value: A Tensor of the same type as "ref". \n
 
 *@par Attributes:
@@ -1218,12 +1235,14 @@ REG_OP(AssignAdd)
 *@brief Updates "ref" by assigning "value" to it. \n
 
 *@par Inputs:
-*@li ref: A Tensor. Must be one of the following types: float16, float32, int8, int16, int32, int64, uint8, uint16, uint32, uint64.
+*@li ref: A Tensor. Must be one of the following types: float16, float32, int8, int16, 
+int32, int64, uint8, uint16, uint32, uint64.
 *@li value: A Tensor of the same type as "ref". \n
 
 *@par Attributes:
 *@li validate_shape: An optional bool. Defaults to "true".
-                     If "true", the operation will validate that the shape of "value" matches the shape of the Tensor being assigned to.
+                     If "true", the operation will validate that the shape of "value"
+                     matches the shape of the Tensor being assigned to.
 *                    If "false", "ref" will take on the shape of "value".
 *                    This attribute is reserved.
 *@li use_locking: An optional bool. Defaults to True.
@@ -1252,7 +1271,8 @@ REG_OP(Assign)
 
 *
 *@par Inputs:
-*@li var: A tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, uint16, complex128, uint32, uint64
+*@li var: A tensor. Must be one of the following types: float32, float64,
+int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, uint16, complex128, uint32, uint64
 *@li value: A tensor of the same type as "var".
 *
 *@par Attributes:
@@ -1644,7 +1664,9 @@ REG_OP(Atan2)
 
 *
 *@par Inputs:
-*@li x1: A tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, uint16, complex128, float16, uint32, uint64
+*@li x1: A tensor. Must be one of the following types: float32, float64, int32,
+ uint8, int16, int8, complex64, int64, qint8, quint8, qint32, uint16, complex128,
+float16, uint32, uint64
 *@li x2: A tensor of the same type as "x1".
 *
 *@par Attributes:
@@ -1666,16 +1688,18 @@ REG_OP(ApproximateEqual)
 
 /**
 *@brief Returns the element-wise sum of a list of tensors.\n
-* AccumulateNV2 performs the same operation as AddN, but does not wait for all of its inputs
-to be ready before beginning to sum.\n This can save memory if inputs are ready at different times,
-since minimum temporary storage is proportional to the output size rather than the inputs size.
- Returns a Tensor of same shape and type as the elements of inputs. \n
+* AccumulateNV2 performs the same operation as AddN, but does not wait for all 
+of its inputs to be ready before beginning to sum.\n This can save memory if 
+inputs are ready at different times, \n since minimum temporary storage is 
+proportional to the output size rather than the inputs size.\n Returns a Tensor 
+of same shape and type as the elements of inputs. \n
 
 *
 *@par Inputs:
 *Dynamic inputs, including:
-* x: A tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, complex64, int64,
-qint8, quint8, qint32, uint16, complex128, float16, uint32, uint64. It's a dynamic input. \n
+* x: A tensor. Must be one of the following types: float32, float64, int32, 
+uint8, int16, int8, complex64, int64, \n qint8, quint8, qint32, uint16, 
+complex128, float16, uint32, uint64.
 *
 *@par Outputs:
 * y: A tensor. Has the same type as "x".
@@ -1731,7 +1755,8 @@ REG_OP(FakeQuantWithMinMaxArgs)
 
 *@par Inputs:
 *Two inputs, including: \n
-*@li gradients: A Tensor of type float32. Backpropagated gradients above the FakeQuantWithMinMaxArgs operation.
+*@li gradients: A Tensor of type float32. Backpropagated gradients 
+above the FakeQuantWithMinMaxArgs operation.
 *@li x: A Tensor of type float32. Has the same type and format as "gradients".\n
 * This is the input Tensor of the FakeQuantWithMinMaxArgs operator.\n
 
@@ -2210,9 +2235,13 @@ REG_OP(BiasAdd)
 
 *@par Inputs:
 *Two inputs, including:
-*@li x: A Tensor. Must be one of the following types: float32, float64, int32, uint8, int16, int8, complex64, int64, qint8, quint8, qint32, bfloat16, uint16, complex128, float16, uint32, uint64.
+*@li x: A Tensor. Must be one of the following types: float32, float64, int32, 
+uint8, int16, int8, complex64, int64, qint8, quint8, qint32, bfloat16, uint16, 
+complex128, float16, uint32, uint64.
 *format is ND.
-*@li dimension: A Tensor. Must be one of the following types: int32, int64. Must be in the range [-rank(input x), rank(input x)]. Describes which dimension of the input Tensor to reduce across.
+*@li dimension: A Tensor. Must be one of the following types: int32, int64. 
+Must be in the range [-rank(input x), rank(input x)]. Describes which dimension 
+of the input Tensor to reduce across.
 * The format is ND.
 *@par Attributes:
 *dtype: The output type, either "int32" or "int64". Defaults to "int64". \n
@@ -2286,6 +2315,7 @@ REG_OP(ArgMaxV2)
     .ATTR(dtype, Type, DT_INT64)
     .OP_END_FACTORY_REG(ArgMaxV2)
 
+
 /**
 *@brief Returns the index with the largest value across axes of a tensor. \n
 
@@ -2298,15 +2328,16 @@ REG_OP(ArgMaxV2)
 *@li dtype: The output type, either "int32" or "int64". Defaults to "int64". \n
 
 *@par Outputs:
-*y: A multi-dimensional Tensor of type int32, specifying the index with the largest value. The dimension is one less than that of "x". \n
+*y: A multi-dimensional Tensor of type int32, specifying the index with the 
+largest value. The dimension is one less than that of "x". \n
 
 *@attention Constraints:
 *@li x: If there are multiple maximum values, the index of the first maximum value is used.
-*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x". \n
+*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the 
+dimension length of "x". \n
 
 *@par Third-party framework compatibility
 * Compatible with TensorFlow operator ArgMax.
-*
 * @par Restrictions:
 *Warning: THIS FUNCTION IS EXPERIMENTAL.  Please do not use.
 */
@@ -2929,9 +2960,13 @@ REG_OP(FusedMulAddN)
 *@li bias: An ND tensor of type float16 or float32. \n
 
 *@par Attributes:
-*@li axis: An optional int32 used to compute the shape of bias input from the online bottoms. Defaults to "1".
-*@li num_axes: An optional int32 used to compute the shape of bias input from a Caffe model trained offline. Defaults to "1".
-*@li bias_from_blob: An optional bool. If "true", bias is input from a Caffe model trained offline. If "false", bias is input from online bottoms. Defaults to "true". \n
+*@li axis: An optional int32 used to compute the shape of bias input from the 
+online bottoms. Defaults to "1".
+*@li num_axes: An optional int32 used to compute the shape of bias input from a 
+Caffe model trained offline. Defaults to "1".
+*@li bias_from_blob: An optional bool. If "true", bias is input from a Caffe 
+model trained offline. If "false", bias is input from online bottoms. Defaults 
+to "true". \n
 
 *@par Outputs:
 *y: An ND tensor of type float16 or float32. \n
@@ -2939,13 +2974,25 @@ REG_OP(FusedMulAddN)
 *@attention Constraints:\n
 * Assume that the shape length of "x" is "n" and that of "bias" is "m".
 *@li "axis" is within the range [-n, n-1]. num_axes >= -1.
-*@li If "bias_from_blob = true", "num_axes = -1", and "axis >= 0", the ith axis of "bias" and the (i+"axis")th axis of "x" must have the same size (0 <= i < n-axis).\n
-* If "axis < 0", the ith axis of "bias" and the (i+n+"axis")th axis of "x" must have the same size (0 <= i < -axis).
-*@li If "bias_from_blob = true" and "num_axes = 0", "bias" is a scalar with shape length 1 and dimension size 1.
-*@li If "bias_from_blob = true", "num_axes > 0, and "axis >= 0", "axis + num_axes" must be less than or equal to "n" and the ith axis of "bias" and the (i+"axis")th axis of "x" must have the same size (0 <= i < num_axes).\n
-* If "axis < 0", "n + axis + num_axes" must be less than or equal to "n" and the ith axis of "bias" and the (i+n+"axis")th axis of "x" must have the same size (0 <= i < num_axes).
-*@li If "bias_from_blob = false", "bias" is not a scalar, and "axis >= 0","axis + m" must be less than or equal to "n" and the ith axis of "bias" and the (i+"axis")th axis of "x" must have the same size (0 <= i < m).\n
-* If "axis < 0", "n + axis + m" must be less than or equal to "n" and the ith axis of "bias" and the (i+n+"axis")th axis of "x" must have the same size (0 <= i < m).
+*@li If "bias_from_blob = true", "num_axes = -1", and "axis >= 0", the ith axis 
+of "bias" and the (i+"axis")th axis of "x" must have the same size (0 <= i < 
+n-axis).\n
+* If "axis < 0", the ith axis of "bias" and the (i+n+"axis")th axis of "x" must 
+have the same size (0 <= i < -axis).
+*@li If "bias_from_blob = true" and "num_axes = 0", "bias" is a scalar with 
+shape length 1 and dimension size 1.
+*@li If "bias_from_blob = true", "num_axes > 0", and "axis >= 0", "axis + 
+num_axes" must be less than or equal to "n" and the ith axis of "bias" and the 
+(i+"axis")th axis of "x" must have the same size (0 <= i < num_axes).\n
+* If "axis < 0", "n + axis + num_axes" must be less than or equal to "n" and 
+the ith axis of "bias" and the (i+n+"axis")th axis of "x" must have the same 
+size (0 <= i < num_axes).
+*@li If "bias_from_blob = false", "bias" is not a scalar, and "axis >= 0",
+"axis + m" must be less than or equal to "n" and the ith axis of "bias" and the
+(i+"axis")th axis of "x" must have the same size (0 <= i < m).\n
+* If "axis < 0", "n + axis + m" must be less than or equal to "n" and the ith 
+axis of "bias" and the (i+n+"axis")th axis of "x" must have the same size (0 <= 
+i < m).
 *@par Third-party framework compatibility
 * Compatible with the Caffe operator Bias.
 */
@@ -3023,10 +3070,12 @@ REG_OP(FusedMulAddNL2loss)
 *@li x: A Tensor with any format. Must be one of the following types: float16, float32. \n
 
 *@par Attributes:
-*@li threshold: A required float32. Defaults to "0.0". "x" is compared with "threshold", outputs "1" for inputs above threshold; "0" otherwise. \n
+*@li threshold: A required float32. Defaults to "0.0". "x" is compared with 
+"threshold", outputs "1" for inputs above threshold; "0" otherwise. \n
 
 *@par Outputs:
-*@li y: A Tensor with any format. Has the same type as the input. Must be one of the following types: float16, float32.
+*@li y: A Tensor with any format. Has the same type as the input. Must be one 
+of the following types: float16, float32.
 *@par Third-party framework compatibility
 * Compatible with the Caffe operator Threshold.
 */
@@ -3044,11 +3093,16 @@ REG_OP(FusedMulAddNL2loss)
 *@li x: A tensor. Must be one of the following types: float16, float32. \n
 
 *@par Attributes:
-*@li axis: An optional int. Specify the axis to be cut at the input tensor. If this parameter is not provided, find the topk for each batch. Defaults to 10000
-*@li out_max_val: An optional bool. Whether to output the maximum value. If it is True, the maximum value and index are output, otherwise only the index is output.
+*@li axis: An optional int. Specify the axis to be cut at the input tensor. If 
+this parameter is not provided, find the topk for each batch. Defaults to 10000
+*@li out_max_val: An optional bool. Whether to output the maximum value. If it 
+is True, the maximum value and index are output, otherwise only the index is 
+output.
 * Defaults to False
-*@li topk: An optional int. It means the number of top tok in each axis (the value is greater than or equal to 1), and the value range must be in [1,x.shape(axis)].
-* Defaults to 1
+*@li topk: An optional int. It means the number of top k in each axis (the 
+value is greater than or equal to 1), and the value range must be in 
+[1,x.shape(axis)].
+* Defaults to 1 \n
 
 *@par Outputs:
 *@li indices: A tensor of type float16, float32, int32. The index of the maximum value of the output.
@@ -3168,7 +3222,8 @@ REG_OP(Axpy)
     .OP_END_FACTORY_REG(Axpy)
 
 /**
-*@brief Creates a criterion that measures the loss given input tensors x1 x2 and a Tensor label y with values 1 or -1. \n
+*@brief Creates a criterion that measures the loss given input tensors x1 x2 
+and a Tensor label y with values 1 or -1. \n
 
 *@par Inputs:
 *@li x1: A ND Tensor with one of the following types: int8, uint8, int32, float16, float32.
diff --git a/third_party/fwkacllib/inc/ops/functional_ops.h b/third_party/fwkacllib/inc/ops/functional_ops.h
index 1e67c41f..bf5ebd51 100644
--- a/third_party/fwkacllib/inc/ops/functional_ops.h
+++ b/third_party/fwkacllib/inc/ops/functional_ops.h
@@ -36,7 +36,7 @@ namespace ge {
  *          if "cond" is a numerical scalar, non-zero means True and zero means False;
  *          if "cond" is a string scalar, non-empty means True and empty means False;
  *          if "cond" is not a scalar, non-empty means True and empty means False.
- *@li input: The input tensors . It's a dynamic input. \n
+ *@li input: The input tensors . \n
 
  *@par Graphs:
  *@li then_branch: A subgraph takes 'input' and returns a list of tensors,
@@ -69,7 +69,7 @@ REG_OP(_If)
  *          if "cond" is a numerical scalar, non-zero means True and zero means False;
  *          if "cond" is a string scalar, non-empty means True and empty means False;
  *          if "cond" is not a scalar, non-empty means True and empty means False.
- *@li input: The input tensors . It's a dynamic input. \n
+ *@li input: The input tensors . \n
 
  *@par Graphs:
  *@li then_branch: A subgraph takes 'input' and returns a list of tensors,
@@ -102,7 +102,7 @@ REG_OP(StatelessIf)
  *          if "cond" is a numerical scalar, non-zero means True and zero means False;
  *          if "cond" is a string scalar, non-empty means True and empty means False;
  *          if "cond" is not a scalar, non-empty means True and empty means False.
- *@li input: The input tensors . It's a dynamic input. \n
+ *@li input: The input tensors . \n
 
  *@par Graphs:
  *@li then_branch: A subgraph takes 'input' and returns a list of tensors,
@@ -129,7 +129,7 @@ REG_OP(If)
 
  *@par Inputs:
  *@li branch_index: A int32 scalar which determines the selected subgraph.
- *@li input: The input tensors, which will be passed to the subgraph . It's a dynamic input. \n
+ *@li input: The input tensors, which will be passed to the subgraph . \n
 
  *@par Graphs:
  *branches: A list of subgraphs, each of which takes 'input' and returns a list of tensors,
@@ -152,7 +152,7 @@ REG_OP(Case)
  *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n
 
  *@par Inputs:
- *input: The input tensors . It's a dynamic input. \n
+ *input: The input tensors . \n
 
  *@par Graphs:
  *@li cond: A subgraph takes 'input' and returns a tensor.
@@ -183,7 +183,7 @@ REG_OP(_While)
  *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n
 
  *@par Inputs:
- *input: The input tensors . It's a dynamic input. \n
+ *input: The input tensors . \n
 
  *@par Graphs:
  *@li cond: A subgraph takes 'input' and returns a tensor.
@@ -215,7 +215,7 @@ REG_OP(While)
  *@brief Cyclic execute the "body" subgraph until the return tensor of "cond" subgraph means False . \n
 
  *@par Inputs:
- *input: The input tensors . It's a dynamic input. \n
+ *input: The input tensors . \n
 
  *@par Graphs:
  *@li cond: A subgraph takes 'input' and returns a tensor.
@@ -250,7 +250,7 @@ REG_OP(StatelessWhile)
  *@li start: A int32 scalar. The lower bound.
  *@li limit: A int32 scalar. The upper bound.
  *@li delta: A int32 scalar. The step size.
- *@li input: The input tensors, which will be passed to "body" . It's a dynamic input. \n
+ *@li input: The input tensors, which will be passed to "body" . \n
 
  *@par Graphs:
  *body: A subgraph takes 'input' and returns a another list of tensors . \n
@@ -274,7 +274,7 @@ REG_OP(For)
  *@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n
 
  *@par Inputs:
- *args: The input tensors, which will be passed to "f" . It's a dynamic input. \n
+ *args: The input tensors, which will be passed to "f" . \n
 
  *@par Graphs:
  *f: A subgraph takes 'args' and returns a another list of tensors . \n
@@ -303,7 +303,7 @@ REG_OP(PartitionedCall)
  *@brief Pass the input tensors to the subgraph "f" and return the output tensors . \n
 
  *@par Inputs:
- *args: The input tensors, which will be passed to "f" . It's a dynamic input. \n
+ *args: The input tensors, which will be passed to "f" . \n
 
  *@par Graphs:
  *f: A subgraph takes 'args' and returns a another list of tensors . \n
diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h
index 27fb79a9..302823a2 100644
--- a/third_party/fwkacllib/inc/ops/image_ops.h
+++ b/third_party/fwkacllib/inc/ops/image_ops.h
@@ -160,8 +160,10 @@ REG_OP(CropAndResize)
 *@li box_index: A Tensor of type int32. A 1-D tensor of shape [num_boxes] with int32 values in [0, batch) . \n
 
 *@par Attributes:
-*@li crop_size: list int. [crop_height, crop_width]. All cropped image patches are resized to this size.
-*@li extrapolation_value: An optional float. Defaults to 0. Value used for extrapolation, when applicable.
+*@li crop_size: list int. [crop_height, crop_width]. All cropped image patches 
+are resized to this size.
+*@li extrapolation_value: An optional float. Defaults to 0. Value used for 
+extrapolation, when applicable.
 *@li method: An optional string from: '"bilinear"'. Defaults to "bilinear" . \n
 
 *@par Outputs:
@@ -172,7 +174,6 @@ REG_OP(CropAndResize)
 
 *@par Third-party framework compatibility
 *Compatible with tensorflow CropAndResize operator.
-
 * @par Restrictions:
 * Warning: THIS FUNCTION IS DEPRECATED. Please use CropAndResize instead.
 */
diff --git a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
index 4fa85cbc..073d541d 100644
--- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
@@ -87,39 +87,58 @@ REG_OP(L2NormalizeGrad)
 
 *@par Inputs:
 * Five inputs, including: (NHWC, NCHW, or NC1HWC0 supported)
-*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
-*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
+*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW 
+for 4D or NC1HWC0 for 5D.
+*@li scale: A Tensor of type float32. Must be 1D if input "x" is with format 
+NHWC or NCHW. Must be 5D
 if input "x" is with format NC1HWC0. Specifies the scaling factor.
 *@li offset: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
 if input "x" is with format NC1HWC0. Specifies the offset.
-*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
-if input "x" is with format NC1HWC0. Specifies the mean used for inference. Must be "None" if the
+*@li mean: A Tensor of type float32. Must be 1D if input "x" is with format 
+NHWC or NCHW. Must be 5D
+if input "x" is with format NC1HWC0. Specifies the mean used for inference. 
+Must be "None" if the
 operation is used for training.
-*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be
-5D if input "x" is with format NC1HWC0. Specifies the variance used for inference. Must be "None"
+*@li variance: A Tensor of type float32. Must be 1D if input "x" is with format 
+NHWC or NCHW. Must be
+5D if input "x" is with format NC1HWC0. Specifies the variance used for 
+inference. Must be "None"
 if the operation is used for training . \n
 
 *@par Attributes:
-*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.0001".
-*@li data_format: An optional string, specifying the format of "x". Defaults to "NHWC".
-*@li is_training: An optional bool, specifying if the operation is used for training or inference. Defaults to "True" . \n
+*@li epsilon: An optional float32, specifying the small value added to variance 
+to avoid dividing by zero. Defaults to "0.0001".
+*@li data_format: An optional string, specifying the format of "x". Defaults to 
+"NHWC".
+*@li is_training: An optional bool, specifying if the operation is used for 
+training or inference. Defaults to "True" . \n
 
 *@par Outputs:
 * Five outputs, including: (NHWC, NCHW, or NC1HWC0 supported)
-*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
-*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D
+*@li y: A 4D or 5D Tensor of type float16 or float32 for the normalized "x", 
+with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
+*@li batch_mean: A Tensor of type float32. Must be 1D if input "x" is with 
+format NHWC or NCHW. Must be 5D
 if input "x" is with format NC1HWC0. Specifies the mean of "x".
-*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
+*@li batch_variance: A Tensor of type float32. Must be 1D if input "x" is with 
+format NHWC or NCHW.
 Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x".
-*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
-Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
-*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
-Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output . \n
+*@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input 
+"x" is with format NHWC or NCHW.
+Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for 
+gradient computation. Pass "None" to skip this output.
+*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input 
+"x" is with format NHWC or NCHW.
+Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" 
+for gradient computation. Pass "None" to skip this output . \n
 
 *@attention Constraints:
-*@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available,
-then "reserve_space_1" has the same value as "mean" and "reserve_space_2" has the same value as "variance".
-*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction . \n
+*@li If the operation is used for inference and outputs "reserve_space_1" and 
+"reserve_space_2" are available,
+then "reserve_space_1" has the same value as "mean" and "reserve_space_2" has 
+the same value as "variance".
+*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square 
+root instruction . \n
 
 *@par Third-party framework compatibility
 *@li Compatible with the TensorFlow operator fused_batch_norm.
@@ -166,13 +185,17 @@ is used for training or inference. Defaults to "True" . \n
 *@li y: A 4D Tensor of type float16 or float32, for the normalized "x".
 *@li batch_mean: A 1D Tensor of type float32, for the mean of "x".
 *@li batch_variance: A 1D Tensor of type float32, for the variance of "x".
-*@li reserve_space_1: A 1D Tensor of type float32, for the mean of "x" for gradient computation.
-*@li reserve_space_2: A 1D Tensor of type float32, for the variance of "x" for gradient computation . \n
+*@li reserve_space_1: A 1D Tensor of type float32, for the mean of "x" for
+gradient computation.
+*@li reserve_space_2: A 1D Tensor of type float32, for the variance of "x" 
+for gradient computation . \n
 
 *@attention Constraints:
 *@li If the operation is used for inference, then output "reserve_space_1"
-has the same value as "mean" and output "reserve_space_2" has the same value as "variance".
-*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square root instruction . \n
+has the same value as "mean" and output "reserve_space_2" has the same value as
+"variance".
+*@li For Ascend 310, the result accuracy fails to reach 1‰ due to the square 
+root instruction . \n
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator fused_batch_norm_v2.
@@ -198,23 +221,34 @@ REG_OP(BatchNormExt2)
 
 *@par Inputs:
 * Five inputs, including:
-*@li y_backprop: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0, for the gradient.
-*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0.
-*@li scale: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0.
-*@li reserve_space_1: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm.
-*@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm . \n
+*@li y_backprop: A 4D or 5D Tensor of type float16 or float32, with format 
+NHWC, NCHW, or NC1HWC0, for the gradient.
+*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, 
+or NC1HWC0.
+*@li scale: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or 
+NC1HWC0.
+*@li reserve_space_1: A 4D or 5D Tensor of type float32, with format NHWC, 
+NCHW, or NC1HWC0. It is an output of BatchNorm.
+*@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, 
+NCHW, or NC1HWC0. It is an output of BatchNorm . \n
 
 *@par Attributes:
-*@li epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x".
+*@li epsilon: An optional float32. Defaults to "0.0001". A small float number 
+added to the variance of "x".
 *@li data_format: An optional string. Defaults to "NHWC".
 *@li is_training: An optional bool. Defaults to "true". Specifies the operation is for training (default) or inference . \n
 
 *@par Outputs:
-*@li x_backprop: A Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "x".
-*@li scale_backprop: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "scale".
-*@li *offset_backprop: A Tensor of type float32, with format NHWC, NCHW, or NC1HWC0, for the offset of "offset".
-*@li *reserve_space_4: A Tensor of type float32, with shape NHWC, NCHW, or NC1HWC0. Pass "None" to skip this output.
-*@li *reserve_space_5: A Tensor of type float32, with shape NHWC, NCHW, or NC1HWC0. Pass "None" to skip this output . \n
+*@li x_backprop: A Tensor of type float16 or float32, with format NHWC, NCHW, 
+or NC1HWC0, for the offset of "x".
+*@li scale_backprop: A Tensor of type float32, with format NHWC, NCHW, or 
+NC1HWC0, for the offset of "scale".
+*@li *offset_backprop: A Tensor of type float32, with format NHWC, NCHW, or 
+NC1HWC0, for the offset of "offset".
+*@li *reserve_space_4: A Tensor of type float32, with shape NHWC, NCHW, or 
+NC1HWC0. Pass "None" to skip this output.
+*@li *reserve_space_5: A Tensor of type float32, with shape NHWC, NCHW, or 
+NC1HWC0. Pass "None" to skip this output . \n
 
 *@attention Constraints:
 * The preceding layer of this operator must be operator BatchNorm . \n
@@ -244,21 +278,28 @@ REG_OP(BatchNormGrad)
 
 *@par Inputs:
 * Five inputs, including:
-*@li y_backprop: A 4D Tensor of type float16 or float32, with format NHWC or NCHW, for the gradient.
+*@li y_backprop: A 4D Tensor of type float16 or float32, with format NHWC or 
+NCHW, for the gradient.
 *@li x: A 4D Tensor of type float16 or float32, with format NHWC or NCHW.
 *@li scale: A 4D Tensor of type float32, with format NHWC or NCHW.
-*@li reserve_space_1: A 4D Tensor of type float32, with format NHWC or NCHW. It is an output of BatchNormExt2.
-*@li reserve_space_2: A 4D Tensor of type float32, with format NHWC or NCHW. It is an output of BatchNormExt2 . \n
+*@li reserve_space_1: A 4D Tensor of type float32, with format NHWC or NCHW. It 
+is an output of BatchNormExt2.
+*@li reserve_space_2: A 4D Tensor of type float32, with format NHWC or NCHW. It 
+is an output of BatchNormExt2 . \n
 
 *@par Attributes:
 *@li epsilon: A required float32. A small float number added to the variance of "x".
 *@li data_format: A required string for the format.
-*@li is_training: A required bool for specifying the operation is for training (true) or inference (false) . \n
+*@li is_training: A required bool for specifying the operation is for training 
+(true) or inference (false) . \n
 
 *@par Outputs:
-*@li x_backprop: A Tensor of type float16 or float32, with format NHWC or NCHW, for the offset of "x".
-*@li scale_backprop: A Tensor of type float32, with format NHWC or NCHW, for the offset of "scale".
-*@li offset_backprop: A Tensor of type float32, with format NHWC or NCHW, for the offset of "offset".
+*@li x_backprop: A Tensor of type float16 or float32, with format NHWC or NCHW, 
+for the offset of "x".
+*@li scale_backprop: A Tensor of type float32, with format NHWC or NCHW, for 
+the offset of "scale".
+*@li offset_backprop: A Tensor of type float32, with format NHWC or NCHW, for 
+the offset of "offset".
 *@li reserve_space_3: A Tensor of type float32, with format NHWC or NCHW.
 *@li reserve_space_4: A Tensor of type float32, with format NHWC or NCHW . \n
 
@@ -290,14 +331,18 @@ REG_OP(BatchNormGradExt2)
 *@brief Performs batch normalization . \n
 
 *@par Inputs:
-*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
-*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x"  Specifies the mean used for inference.
-*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x"  Specifies the variance used for inference.
+*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW 
+for 4D or NC1HWC0 for 5D.
+*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x"  
+Specifies the mean used for inference.
+*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x"  
+Specifies the variance used for inference.
 *@li momentum: A Tensor,represents the mean and the variance's scale factor
 *@li scale: An optional tensor of type float16 or float32, no use
 *@li offset: An optional tensor of type float16 or float32, no use
 *@par Attributes:
-*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.00001".
+*@li epsilon: An optional float32, specifying the small value added to variance 
+to avoid dividing by zero. Defaults to "0.00001".
 *@li use_global_stats: mean inference mode , only can be "True".
 *@li mode: An optional input, not use
 *@par Outputs:
@@ -315,16 +360,20 @@ REG_OP(BNInference)
     .ATTR(use_global_stats, Bool,true)
     .ATTR(mode, Int,1)
     .OP_END_FACTORY_REG(BNInference)
+
 /**
 *@brief aicpu batch normalization host  . \n
 
 *@par Inputs:
 
-*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x"  Specifies the mean used for inference.
-*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x"  Specifies the variance used for inference.
+*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x"  
+Specifies the mean used for inference.
+*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x"  
+Specifies the variance used for inference.
 *@li momentum: An optional float, mean and variance's Scale factor
 *@par Attributes:
-*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.00001".
+*@li epsilon: An optional float32, specifying the small value added to variance 
+to avoid dividing by zero. Defaults to "0.00001".
 *@li use_global_stats: mean inference mode , only can be "True".
 *@li mode: An optional attr, not use
 *@par Outputs:
@@ -348,14 +397,19 @@ REG_OP(BnHost)
 *@brief Performs batch normalization . \n
 
 *@par Inputs:
-*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW for 4D or NC1HWC0 for 5D.
-*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" Specifies the mean used for inference.
-*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" Specifies the variance used for inference.
+*@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC or NCHW 
+for 4D or NC1HWC0 for 5D.
+*@li mean: A Tensor of type float32 or float16. Must be 1D if input "x" 
+Specifies the mean used for inference.
+*@li variance: A Tensor of type float32 or float16 . Must be 1D if input "x" 
+Specifies the variance used for inference.
 *@li scale: An optional tensor of type float16 or float32, no use
 *@li offset: An optional tensor of type float16 or float32, no use
 *@par Attributes:
-*@li momentum: An optional float32 num, represents the mean and the variance's scale factor
-*@li epsilon: An optional float32, specifying the small value added to variance to avoid dividing by zero. Defaults to "0.00001".
+*@li momentum: An optional float32 num, represents the mean and the variance's 
+scale factor
+*@li epsilon: An optional float32, specifying the small value added to variance 
+to avoid dividing by zero. Defaults to "0.00001".
 *@li use_global_stats: mean inference mode , only can be "True".
 *@li mode: An optional attr, not use
 *@par Outputs:
diff --git a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h
index 12412516..6307889d 100644
--- a/third_party/fwkacllib/inc/ops/nn_calculation_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_calculation_ops.h
@@ -310,9 +310,6 @@ REG_OP(DepthwiseConv2DBackpropInputD)
 * @par Third-party framework compatibility
 * @li Compatible with the TensorFlow operator DepthwiseConv2D.
 * @li Compatible with the Caffe operator DepthwiseConv2D.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS EXPERIMENTAL.  Please do not use.
 */
 REG_OP(DepthwiseConv2D)
     .INPUT(x, TensorType({DT_FLOAT16, DT_INT8}))
@@ -460,9 +457,9 @@ REG_OP(Conv2DBackpropInputD)
 *@par Attributes:
  * Six attributes:
  * @li strides: A tuple or list of 2 integers. The stride of the sliding window
- * for H/W dimension.
+ * for H/W dimension, defaults to [1,1].
  * @li pads: A tuple or list of 4 integers. The [top, bottom, left, right]
- * padding on the feature map.
+ * padding on the feature map, defaults to [0,0,0,0].
  * @li dilations: A tuple or list of 4 integers. The dilation factor for each
  * dimension of input, defaults to [1,1,1,1].
  * @li groups: Number of blocked connections from input channels to
@@ -482,8 +479,8 @@ REG_OP(Deconvolution)
     .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_INT32}))
     .OPTIONAL_INPUT(offset_w, TensorType({DT_INT8}))
     .OUTPUT(y, TensorType({DT_FLOAT16, DT_INT32}))
-    .REQUIRED_ATTR(strides, ListInt)
-    .REQUIRED_ATTR(pads, ListInt)
+    .ATTR(strides, ListInt, {1, 1})
+    .ATTR(pads, ListInt, {0, 0, 0, 0})
     .ATTR(dilations, ListInt, {1, 1, 1, 1})
     .ATTR(groups, Int, 1)
     .ATTR(data_format, String, "NCHW")
@@ -593,7 +590,7 @@ REG_OP(Conv2DBackpropFilterD)
 
 *@li bias: An optional 1D tensor. Shape is [out_channels].
 *@li offset_w: An optional 1D tensor for quantized convolution. Shape is
-* [out_channels]. Reserved.
+* [out_channels]. Not supported.
 *\n
 *\n
 * Note that there is a strict data type mapping between the input and output
@@ -622,7 +619,8 @@ REG_OP(Conv2DBackpropFilterD)
 * and right padding.
 * @li dilations: Optional. A list of 4 integers. Specifying the dilation rate
 * to use for dilated convolution. Has the same dimension order and value as
-* "strides". Defaults to [1, 1, 1, 1].
+* "strides". Dilation > 1 is not supported for quantized convolution. Defaults
+* to [1, 1, 1, 1].
 * @li groups: Optional. An integer of type int32, for the number of blocked
 * connections from input channels to output channels. Input channels and output
 * channels must both be divisible by "groups". "x" in_channels must be equal to
@@ -704,13 +702,62 @@ REG_OP(Conv2D)
     .ATTR(offset_x, Int, 0)
     .OP_END_FACTORY_REG(Conv2D)
 
+/**
+*@brief Computes a 2D convolution given 4D "x" and "filter_compress" tensors.
+*@par Inputs:
+* @li x: A 4D tensor of input images.
+* @li filter_compress: A 4D tensor of compressed filters.
+* @li compress_index: A 1D tensor of type int8.
+* @li bias: An optional 1D tensor.
+* @li offset_w: An optional 1D tensor for quantized convolution. Reserved.
+*
+* The input and output tensor attributes are listed as follows:
+* @verbatim
+    |Tensor    | x       | filter_compress | bias    | offset_w | y
+    -----------|---------|-----------------|---------|----------|--------
+    |Data Type | float16 | float16         | float16 | _        | float16
+    |          |---------|-----------------|---------|----------|--------
+    |          | float32 | float32         | float32 | _        | float32
+    |          |---------|-----------------|---------|----------|--------
+    |          | int8    | int8            | int32   | int8     | int32
+    -----------|---------|-----------------|---------|----------|--------
+    |Format    | NCHW    | NCHW            | ND      | ND       | NCHW
+    |          | NHWC    | NHWC            |         |          | NHWC
+    |          |         | HWCN            |         |          |
+@endverbatim
+* It should be noted that the data types must correspond to each other, but the
+* format does not need to . \n
+
+*@par Attributes:
+* @li strides: A list of 4 integers. Specifying the strides of the
+* convolution along the height and width. The dimension order is determined
+* by the data format of "x". By default the N and C dimensions are set to 1.
+* @li pads: A list of 4 integers. Specifying the top, bottom, left and right
+* padding.
+* @li dilations: A list of 4 integers. Specifying the dilation rate to use
+* for dilated convolution. Has the same dimension order and value as "strides".
+* @li groups: Number of blocked connections from input channels to output
+* channels. Input channels and output channels must both be divisible by
+* "groups". Type is int32.
+* @li offset_x: An optional integer for quantized convolution. Type is int32.
+* Defaults to "0".
+* @li data_format: An optional string from: "NHWC", "NCHW". Specifying the
+* data format of the input and output images. Type is string.
+* Defaults to "NHWC". Reserved . \n
+
+*@par Outputs:
+* @li y: A 4D Tensor of output images . \n
+
+*@par Restrictions:
+*Warning: THIS FUNCTION IS DEPRECATED.
+*/
 REG_OP(Conv2DCompress)
-    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8}))
-    .INPUT(filter_compress, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT8}))
+    .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8}))
+    .INPUT(filter_compress, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8}))
     .INPUT(compress_index, TensorType({DT_INT8}))
-    .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32}))
+    .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
     .OPTIONAL_INPUT(offset_w, TensorType({DT_INT8}))
-    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32}))
+    .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
     .REQUIRED_ATTR(strides, ListInt)
     .REQUIRED_ATTR(pads, ListInt)
     .ATTR(dilations, ListInt, {1, 1, 1, 1})
diff --git a/third_party/fwkacllib/inc/ops/nn_detect_ops.h b/third_party/fwkacllib/inc/ops/nn_detect_ops.h
index 415cc4ef..bd8bb9bf 100644
--- a/third_party/fwkacllib/inc/ops/nn_detect_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_detect_ops.h
@@ -158,18 +158,25 @@ REG_OP(Iou)
 *@par Inputs:
 * Three inputs, including:
 *@li ydiff: A 5HD gradient input of type float32.
-*@li rois: ROI position. A 2D Tensor of float32 with shape (N, 5). "N" indicates the number of ROIs,
-the value "5" indicates the indexes of images where the ROIs are located, "x0", "x1", "y0", and "y1".
-*@li rois_n: An optional input, specifying the number of valid ROIs. This parameter is reserved . \n
+*@li rois: ROI position. A 2D Tensor of float32 with shape (N, 5). "N" 
+indicates the number of ROIs,
+the value "5" indicates the indexes of images where the ROIs are located, "x0", 
+"x1", "y0", and "y1".
+*@li rois_n: An optional input, specifying the number of valid ROIs. This 
+parameter is reserved . \n
 
 *@par Attributes:
 *@li xdiff_shape: A required list of 4 ints, obtained based on the shape of "features" of ROIAlign.
 *@li pooled_width: A required attribute of type int, specifying the W dimension.
 *@li pooled_height: A required attribute of type int, specifying the H dimension.
-*@li spatial_scale: A required attribute of type float, specifying the scaling ratio of "features" to the original image.
-*@li sample_num: An optional attribute of type int, specifying the horizontal and vertical
-sampling frequency of each output. If this attribute is set to "0", the sampling frequency is
-equal to the rounded up value of "rois", which is a floating point number. Defaults to "2" . \n
+*@li spatial_scale: A required attribute of type float, specifying the scaling 
+ratio of "features" to the original image.
+*@li sample_num: An optional attribute of type int, specifying the horizontal 
+and vertical
+sampling frequency of each output. If this attribute is set to "0", the 
+sampling frequency is
+equal to the rounded up value of "rois", which is a floating point number. 
+Defaults to "2" . \n
 
 *@par Outputs:
 *xdiff: Gradient added to input "features". Has the same 5HD shape as input "features".
@@ -876,9 +883,7 @@ REG_OP(YoloV3DetectionOutputV2)
 A Yolo operator has three outputs: "coords", "obj", and "class". For details, see the description of operator Yolo.
 *@li imginfo: A float16, describing the image information including the required image height and width
 and the actual image height and width.
-*@li windex: A windex tensor with shape [height,weight]. Has the same type as the inputs.
-[[0,1,2...(weight-1)],[0,1,2...(w-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)]
-is formed for the three Yolo outputs, respectively .It's a dynamic input. \n
+*@li windex: A windex tensor with shape [height,weight]. Has the same type as the inputs. [[0,1,2...(weight-1)],[0,1,2...(weight-1)]...[0,1,2...(weight-1)]] consisting of h groups of [0, 1, 2...(weight-1)] is formed for the three Yolo outputs, respectively . \n
 
 *@li hindex: A hindex tensor with shape [height,weight]. Has the same type as the inputs. [[0,0...0],[1,1...1],[2,2...2]...[height-1,height-1...,height-1]] is formed for the three Yolo outputs, respectively . \n
 
diff --git a/third_party/fwkacllib/inc/ops/nn_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_norm_ops.h
index 14949c54..0d0032cf 100644
--- a/third_party/fwkacllib/inc/ops/nn_norm_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_norm_ops.h
@@ -896,29 +896,7 @@ REG_OP(InstanceNormV2)
     .ATTR(epsilon, Float, 0.00001)
     .OP_END_FACTORY_REG(InstanceNormV2)
 
-/**
-*@brief Performs instance normalization for inference.
-
-*@par Inputs:\n
-* Five inputs, including: (NC1HWC0 supported)
-*@li x: A Tensor of type float16 or float32.
-*@li gamma: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling gamma.
-*@li beta: A [N, C1, 1, 1, C0] Tensor of type float32, for the scaling beta.
-*@li mean: A [N, C1, 1, 1, C0] ensor of type float32, for the mean.
-*@li variance: A [N, C1, 1, 1, C0] Tensor of type float32, for the variance.
-*@li variance_sqrt: A [N, C1, 1, 1, C0] Tensor of type float32, for the variance_sqrt.
-
-*@par Outputs:\n
-*y: A Tensor of type float16 or float32 for the normalized "x".
-*batch_mean: A Tensor of type float32 for the result mean.
-*batch_ variance: A Tensor of type float32 for the result variance.
 
-*@attention Constraints:
-*For Ascend 310, the result accuracy fails to reach 1<89> due to the square root instruction.
-
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use INInferV2 instead.
-*/
 REG_OP(INInferV2D)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
     .OPTIONAL_INPUT(gamma, TensorType({DT_FLOAT}))
@@ -931,6 +909,20 @@ REG_OP(INInferV2D)
     .OUTPUT(batch_variance, TensorType({DT_FLOAT}))
     .OP_END_FACTORY_REG(INInferV2D)
 
+/**
+*@brief Performs instance normalization for inference of InHost part.
+
+*@par Inputs:\n
+* One input, including: (NC1HWC0 supported)
+* variance: A [N, C1, 1, 1, C0] Tensor of type float32, for the variance.
+
+*@par Attributes:
+* epsilon: An optional float32, specifying the small value added to
+variance to avoid dividing by zero. Defaults to "0.00001" . \n
+
+*@par Outputs:\n
+* variance_sqrt: A [N, C1, 1, 1, C0] Tensor of type float32, for the variance_sqrt.
+*/
 REG_OP(InHost)
      .INPUT(variance, TensorType({DT_FLOAT}))
      .OUTPUT(variance_sqrt, TensorType({DT_FLOAT}))
diff --git a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
index 5d3cd931..fb7fc127 100644
--- a/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_pooling_ops.h
@@ -128,9 +128,6 @@ REG_OP(AvgPool)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator AvgPool3D.
-*
-* @par Restrictions:
-*Warning: THIS FUNCTION IS EXPERIMENTAL.  Please do not use.
 */
 REG_OP(AvgPool3D)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT32, DT_DOUBLE}))
diff --git a/third_party/fwkacllib/inc/ops/nn_training_ops.h b/third_party/fwkacllib/inc/ops/nn_training_ops.h
index 65fb462e..0621a96c 100644
--- a/third_party/fwkacllib/inc/ops/nn_training_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_training_ops.h
@@ -111,9 +111,6 @@ REG_OP(ApplyAdaMax)
 *
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyAdaMax.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdaMax instead.
 */
 REG_OP(ApplyAdaMaxD)
     .INPUT(var, TensorType::NumberType())
@@ -352,9 +349,6 @@ REG_OP(ApplyMomentum)
 * accum: A mutable tensor. Has the same type as input "accum".
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyMomentum.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyMomentum instead.
 */
 
 REG_OP(ApplyMomentumD)
@@ -681,9 +675,6 @@ REG_OP(ApplyPowerSign)
 *
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyPowerSign.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyPowerSign instead.
 */
 REG_OP(ApplyPowerSignD)
     .INPUT(var, TensorType::NumberType())
@@ -804,9 +795,6 @@ REG_OP(ApplyAddSign)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator ApplyAddSign.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAddSign instead.
 */
 REG_OP(ApplyAddSignD)
     .INPUT(var, TensorType::NumberType())
@@ -928,9 +916,6 @@ REG_OP(ApplyCenteredRMSProp)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyCenteredRMSPropD.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyCenteredRMSProp instead.
 */
 REG_OP(ApplyCenteredRMSPropD)
     .INPUT(var, TensorType::NumberType())
@@ -1049,9 +1034,6 @@ REG_OP(ApplyAdagrad)
 *
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyAdagrad.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdagrad instead.
 */
 REG_OP(ApplyAdagradD)
     .INPUT(var, TensorType::NumberType())
@@ -1236,9 +1218,6 @@ REG_OP(ApplyAdagradDA)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyAdagradDA.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdagradDA instead.
 */
 REG_OP(ApplyAdagradDAD)
     .INPUT(var, TensorType::NumberType())
@@ -1496,9 +1475,6 @@ REG_OP(ApplyProximalAdagrad)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyProximalAdagradD.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyProximalAdagrad instead.
 */
 REG_OP(ApplyProximalAdagradD)
     .INPUT(var, TensorType::NumberType())
@@ -1592,9 +1568,6 @@ REG_OP(SparseApplyProximalAdagrad)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator SparseApplyProximalAdagrad.
-
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use SparseApplyProximalAdagrad instead.
 */
 REG_OP(SparseApplyProximalAdagradD)
     .INPUT(var, TensorType::NumberType())
@@ -1681,9 +1654,6 @@ REG_OP(ApplyFtrl)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyFtrl.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyFtrl instead.
 */
 REG_OP(ApplyFtrlD)
     .INPUT(var, TensorType::NumberType())
@@ -1775,9 +1745,6 @@ REG_OP(ApplyFtrlV2)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyFtrlV2.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyFtrlV2 instead.
 */
 REG_OP(ApplyFtrlV2D)
     .INPUT(var, TensorType::NumberType())
@@ -1890,9 +1857,6 @@ REG_OP(ApplyAdam)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator ApplyAdam.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdam instead.
 */
 REG_OP(ApplyAdamD)
     .INPUT(var, TensorType::NumberType())
@@ -1981,9 +1945,6 @@ REG_OP(ApplyAdadelta)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator ApplyAdadelta.
-
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ApplyAdadelta instead.
 */
 REG_OP(ApplyAdadeltaD)
     .INPUT(var, TensorType::NumberType())
diff --git a/third_party/fwkacllib/inc/ops/pad_ops.h b/third_party/fwkacllib/inc/ops/pad_ops.h
index 4f42008e..567bc63d 100644
--- a/third_party/fwkacllib/inc/ops/pad_ops.h
+++ b/third_party/fwkacllib/inc/ops/pad_ops.h
@@ -65,9 +65,6 @@ REG_OP(Fill)
 *
 *@par Outputs:
 * y: A tensor. Has the same type as "value".
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use Fill instead.
 */
 REG_OP(FillD)
     .INPUT(value, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16,
@@ -125,9 +122,6 @@ REG_OP(BroadcastTo)
 *
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator BroadcastTo.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use BroadcastTo instead.
 */
 REG_OP(BroadcastToD)
     .INPUT(x, TensorType::BasicType())
@@ -175,9 +169,6 @@ REG_OP(Pad)
 
 *@par Third-party framework compatibility:
 * Compatible with TensorFlow operator Pad.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use Pad instead.
 */
 REG_OP(PadD)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT8, DT_UINT8, DT_FLOAT}))
@@ -269,9 +260,6 @@ REG_OP(PadV3D)
 *@see Diag()
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator Diag.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use Diag instead.
 */
 REG_OP(DiagD)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_INT32}))
diff --git a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h
index cd6cfdfe..ec88c618 100644
--- a/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h
+++ b/third_party/fwkacllib/inc/ops/ragged_conversion_ops.h
@@ -30,7 +30,7 @@ namespace ge {
 *@par Inputs:
 *Two inputs, including:
 *@li rt_nested_splits: A list of at least 1 Tensor objects with the same type
-in: int32, int64. The row_splits for the RaggedTensor. It's a dynamic input.
+in: int32, int64. The row_splits for the RaggedTensor.
 *@li rt_dense_values: A Tensor. The flat_values for the RaggedTensor
 Must be one of the following types: bool, int8, int16, uint16, int32,
 int64, double, float, float16 . \n
@@ -66,7 +66,7 @@ REG_OP(RaggedTensorToSparse)
 *@li values:A 1D tensor representing the values of the ragged tensor.
 *@li default_value:A `Tensor`. Must have the same type as `values`.
 *@li row_partition_tensors:A list of at least 1 `Tensor` objects with the same
-type in: `int64`, `int32` . It's a dynamic input.\n
+type in: `int64`, `int32` .\n
 
 *@par Attributes:
 *@li num_row_partition_tensors:Numbers of row partition tensors.
diff --git a/third_party/fwkacllib/inc/ops/random_ops.h b/third_party/fwkacllib/inc/ops/random_ops.h
index b97d824f..24a9edd1 100644
--- a/third_party/fwkacllib/inc/ops/random_ops.h
+++ b/third_party/fwkacllib/inc/ops/random_ops.h
@@ -374,9 +374,6 @@ REG_OP(DropOutGenMask)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator lin_space.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use LinSpace instead.
 */
 REG_OP(LinSpaceD)
     .INPUT(assist, TensorType({DT_FLOAT}))
diff --git a/third_party/fwkacllib/inc/ops/reduce_ops.h b/third_party/fwkacllib/inc/ops/reduce_ops.h
index 626dda59..80169344 100644
--- a/third_party/fwkacllib/inc/ops/reduce_ops.h
+++ b/third_party/fwkacllib/inc/ops/reduce_ops.h
@@ -353,9 +353,6 @@ REG_OP(ReduceSum)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator Sum.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceSum instead.
 */
 REG_OP(ReduceSumD)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -381,9 +378,6 @@ REG_OP(ReduceSumD)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator ReduceAll.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceAll instead.
 */
 REG_OP(ReduceAllD)
     .INPUT(x, TensorType({DT_BOOL}))
@@ -459,9 +453,6 @@ REG_OP(ReduceProd)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator ReduceProd.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceProd instead.
 */
 REG_OP(ReduceProdD)
     .INPUT(x,TensorType({DT_FLOAT, DT_UINT8, DT_INT8, DT_INT32, DT_FLOAT16}))
@@ -516,9 +507,6 @@ REG_OP(ReduceMean)
 
 *@par Third-party framework compatibility:
 * Compatible with the TensorFlow operator ReduceMean.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceMean instead.
 */
 REG_OP(ReduceMeanD)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -573,9 +561,6 @@ REG_OP(ReduceMax)
 
 *@par Third-party framework compatibility
 * Compatible with TensorFlow operator Max.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceMax instead.
 */
 REG_OP(ReduceMaxD)
     .INPUT(x, TensorType({DT_FLOAT, DT_UINT8, DT_INT8,
@@ -630,9 +615,6 @@ REG_OP(ReduceMin)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator reduce_min.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceMin instead.
 */
 REG_OP(ReduceMinD)
     .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT,DT_INT8,DT_UINT8}))
@@ -699,9 +681,6 @@ REG_OP(ReduceAny)
 *
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator reduce_any.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ReduceAny instead.
 */
 REG_OP(ReduceAnyD)
     .INPUT(x, TensorType({DT_BOOL}))
@@ -787,9 +766,6 @@ REG_OP(EuclideanNorm)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator EuclideanNorm.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use EuclideanNorm instead.
 */
 REG_OP(EuclideanNormD)
     .INPUT(x, TensorType({DT_FLOAT, DT_INT32, DT_FLOAT16}))
diff --git a/third_party/fwkacllib/inc/ops/rnn.h b/third_party/fwkacllib/inc/ops/rnn.h
index e1a83f43..0766d2c6 100644
--- a/third_party/fwkacllib/inc/ops/rnn.h
+++ b/third_party/fwkacllib/inc/ops/rnn.h
@@ -92,6 +92,7 @@ REG_OP(DynamicLSTM)
     .OUTPUT(output_h, TensorType({DT_FLOAT32}))
     .OP_END_FACTORY_REG(DynamicLSTM)
 
+
 /**
 *@brief: DynamicRNNGrad calculation.
 *@par Inputs:
@@ -126,7 +127,7 @@ REG_OP(DynamicLSTM)
 *@li keep_prob:An float identifying the keep prob in the op. Default to 1.
 *@li cell_clip:An float identifying the cell clip in the op. Default to -1.
 *@li num_proj:An integer identifying the num projection in the op. Default to 0.
-*@li time_major:An bool identifying the time major in the op. Default to false.
+*@li time_major:An bool identifying the time major in the op. Default to true.
 *@li activation:An string identifying the type of activation function in the op. Default to "tanh". Only tanh is currently supported.
 *@li forget_bias:An float identifying the forget bias in the op. Default to 0.
 *@li is_training:An bool identifying is training in the op. Default to true.
@@ -138,6 +139,9 @@ REG_OP(DynamicLSTM)
 *@li dx:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
 *@li dh_prev:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
 *@li dc_prev:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+*@li dwci:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+*@li dwcf:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+*@li dwco:A 4D Tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
 */
 REG_OP(DynamicRNNGrad)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
diff --git a/third_party/fwkacllib/inc/ops/save_ops.h b/third_party/fwkacllib/inc/ops/save_ops.h
index 159e7382..7fd853d3 100644
--- a/third_party/fwkacllib/inc/ops/save_ops.h
+++ b/third_party/fwkacllib/inc/ops/save_ops.h
@@ -28,7 +28,7 @@ namespace ge {
 /**
 *@brief Mark which tensors need to be saved to the ckpt file.
 *@par Inputs:
-*tensors: A list of input tensor.It's a dynamic input.
+*tensors: A list of input tensor.
 *@par Restrictions:
 *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
 */
diff --git a/third_party/fwkacllib/inc/ops/sdca_ops.h b/third_party/fwkacllib/inc/ops/sdca_ops.h
index dc6852d4..acf1c34d 100644
--- a/third_party/fwkacllib/inc/ops/sdca_ops.h
+++ b/third_party/fwkacllib/inc/ops/sdca_ops.h
@@ -35,16 +35,16 @@ namespace ge {
 *rate . \n
 
 *@par Inputs:
-*@li sparse_example_indices: a list of vectors which contain example indices.It's a dynamic input.
-*@li sparse_feature_indices: a list of vectors which contain feature indices.It's a dynamic input.
-*@li sparse_feature_values: a list of vectors which contains feature value associated with each feature group.It's a dynamic input.
-*@li dense_features: a list of matrices which contains the dense feature values.It's a dynamic input.
+*@li sparse_example_indices: a list of vectors which contain example indices.
+*@li sparse_feature_indices: a list of vectors which contain feature indices.
+*@li sparse_feature_values: a list of vectors which contains feature value associated with each feature group.
+*@li dense_features: a list of matrices which contains the dense feature values.
 *@li example_weights: a vector which contains the weight associated with each example.
 *@li example_labels: a vector which contains the label/target associated with each example.
 *@li sparse_indices: a list of vectors where each value is the indices which has
-*corresponding weights in sparse_weights. This field maybe omitted for the dense approach.It's a dynamic input.
+*corresponding weights in sparse_weights. This field maybe omitted for the dense approach.
 *@li sparse_weights: a list of vectors where each value is the weight associated with a sparse feature group.
-*@li dense_weights: a list of vectors where the values are the weights associated with a dense feature group.It's a dynamic input.
+*@li dense_weights: a list of vectors where the values are the weights associated with a dense feature group.
 *@li example_state_data: a list of vectors containing the example state data.
 *@li loss_type: Type of the primal loss. Currently SdcaSolver supports logistic, squared and hinge losses.
 *@li l1: Symmetric l1 regularization strength.
@@ -61,7 +61,6 @@ namespace ge {
 *@par Third-party framework compatibility
 * Compatible with tensorflow SdcaOptimizerV2 operator.
 */
-
 REG_OP(SdcaOptimizerV2)
     .DYNAMIC_INPUT(sparse_example_indices, TensorType({DT_INT64}))
     .DYNAMIC_INPUT(sparse_feature_indices, TensorType({DT_INT64}))
diff --git a/third_party/fwkacllib/inc/ops/selection_ops.h b/third_party/fwkacllib/inc/ops/selection_ops.h
index 613ce358..8ef4a42c 100644
--- a/third_party/fwkacllib/inc/ops/selection_ops.h
+++ b/third_party/fwkacllib/inc/ops/selection_ops.h
@@ -79,9 +79,6 @@ REG_OP(Range)
 
 *@see Range()
 *@since V100R001C33
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use Range instead.
 */
 REG_OP(RangeD)
     .INPUT(x, TensorType({DT_FLOAT,DT_INT32}))
@@ -186,7 +183,8 @@ REG_OP(GatherNd)
 *     uint8, int16, int8, int64, qint8, quint8, qint32, qint16, quint16,
 *     uint16, complex128, float16, uint32, uint64, complex64, complex128.
 * @li indices: A Tensor of type int32 or int64.
-* @li axis: A Tensor of type as int32 . \n
+* @li axis: A Tensor of type int32 or int64.
+*     Must be in the range [-rank(x), rank(x)) . \n
 
 *@par Outputs:
 *y: A Tensor. Has the same type as "x" . \n
@@ -225,9 +223,6 @@ REG_OP(GatherV2)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator GatherV2.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use GatherV2 instead.
 */
 REG_OP(GatherV2D)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_UINT32, DT_INT8, DT_UINT8,
@@ -330,9 +325,6 @@ REG_OP(StridedSlice)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator StridedSlice.
-
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use StridedSlice instead.
 */
 REG_OP(StridedSliceD)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_UINT8, DT_INT8,
@@ -388,9 +380,6 @@ REG_OP(StridedSliceD)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator StridedSliceGradD.
-
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use StridedSliceGrad instead.
 */
 REG_OP(StridedSliceGradD)
     .INPUT(dy, TensorType::BasicType())
@@ -502,9 +491,6 @@ REG_OP(UnsortedSegmentSum)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator UnsortedSegmentSum.
-
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentSum instead.
 */
 REG_OP(UnsortedSegmentSumD)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT8, DT_UINT8}))
@@ -729,9 +715,6 @@ REG_OP(OneHot)
 
 *@par Third-party framework compatibility:
 * Compatible with the TensorFlow operator OneHot.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use OneHot instead.
 */
 REG_OP(OneHotD)
     .INPUT(x, TensorType({DT_UINT8, DT_INT32}))
@@ -807,7 +790,7 @@ REG_OP(SliceD)
 * @li assist_seq: A 1D tensor of type float16.
 * with size of 2N, which "N" is the last dimension.
 * The first N numbers is indices, and the next N numbers is deviation of casting
-* int32 to float16. \n
+* float16 to int32 . \n
 
 * @par Attributes:
 * @li k: A required int that is at least 0, specifying the number of top elements
@@ -816,7 +799,7 @@ REG_OP(SliceD)
 * If true, the resulting "k" elements will be sorted by the values in descending
 * order.
 * @li dim: An optional int. Defaults to -1. For reserved use.
-* @li largest: An optional bool. Defaults to true. For reserved use. \n
+* @li largest: An optional bool. Defaults to true. For reserved use.
 
 * @par Outputs:
 * @li values: A Tensor, specifying the sorted data. Has the same type as "input".
@@ -1270,9 +1253,6 @@ REG_OP(InplaceUpdate)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator InplaceUpdate.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use InplaceUpdate instead.
 */
 REG_OP(InplaceUpdateD)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
@@ -1325,9 +1305,6 @@ REG_OP(InplaceAdd)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator InplaceAdd.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use InplaceAdd instead.
 */
 REG_OP(InplaceAddD)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
@@ -1379,9 +1356,6 @@ REG_OP(InplaceSub)
 
 *@par Third-party framework compatibility
 *Compatible with the TensorFlow operator InplaceSub.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use InplaceSub instead.
 */
 REG_OP(InplaceSubD)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
@@ -1433,9 +1407,6 @@ REG_OP(ScatterNonAliasingAdd)
 * @par Outputs:
 * y: A Tensor of type RealNumberType . \n
 
-* @attention Constraints:
-* @li segment_ids must be non-negative tensor.
-
 * @see UnsortedSegmentSum(), UnsortedSegmentProd(),
 
 * @par Third-party framework compatibility
@@ -1463,9 +1434,6 @@ REG_OP(UnsortedSegmentMin)
 * @par Outputs:
 * y: A Tensor.Must have the same type as input "x" . \n
 
-* @attention Constraints:
-* @li segment_ids must be non-negative tensor.
-
 * @see UnsortedSegmentProdD(), UnsortedSegmentSumD(),
 *
 * @par Restrictions:
@@ -1491,9 +1459,6 @@ REG_OP(UnsortedSegmentMinD)
 * @par Outputs:
 * y: A Tensor of type RealNumberType . \n
 
-* @attention Constraints:
-* @li segment_ids must be non-negative tensor.
-
 * @see UnsortedSegmentSum(), UnsortedSegmentProd(),
 
 * @par Third-party framework compatibility
@@ -1521,9 +1486,6 @@ REG_OP(UnsortedSegmentMax)
 * @par Outputs:
 * y: A Tensor.Must have the same type as input "x" . \n
 
-* @attention Constraints:
-* @li segment_ids must be non-negative tensor.
-
 * @see UnsortedSegmentProdD(),
 *
 * @par Restrictions:
@@ -1548,9 +1510,6 @@ REG_OP(UnsortedSegmentMaxD)
 * @par Outputs:
 * y: A Tensor of type NumberType . \n
 
-* @attention Constraints:
-* @li segment_ids must be non-negative tensor.
-
 * @see UnsortedSegmentSum(), UnsortedSegmentMin(),
 
 * @par Third-party framework compatibility
@@ -1582,9 +1541,6 @@ REG_OP(UnsortedSegmentProd)
 * @li segment_ids must be non-negative tensor.
 
 * @see UnsortedSegmentMinD()
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use UnsortedSegmentProd instead.
 */
 REG_OP(UnsortedSegmentProdD)
     .INPUT(x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32, DT_INT16}))
@@ -1900,9 +1856,6 @@ REG_OP(CumulativeLogsumexp)
 *y: A Tensor. Has the same type as "x".
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator Cumsum.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use CumulativeLogsumexp instead.
 */
 REG_OP(CumulativeLogsumexpD)
     .INPUT(x, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16}))
diff --git a/third_party/fwkacllib/inc/ops/split_combination_ops.h b/third_party/fwkacllib/inc/ops/split_combination_ops.h
index b0bd14c0..b66a0213 100644
--- a/third_party/fwkacllib/inc/ops/split_combination_ops.h
+++ b/third_party/fwkacllib/inc/ops/split_combination_ops.h
@@ -75,9 +75,6 @@ REG_OP(Split)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator Split.
-
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use Split instead.
 */
 REG_OP(SplitD)
     .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8,
@@ -144,9 +141,6 @@ Under the caffe framework, the conversion of slice_point through the cut point t
 Under the caffe framework,size_splits or axis transformat to split_dim.Only one can effect.
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator SplitV.
-
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use SplitV instead.
 */
 REG_OP(SplitVD)
     .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8,
@@ -164,8 +158,7 @@ REG_OP(SplitVD)
 * Two inputs, including:
 * @li values: A list of Tensors. Must be one of the following types: int8, int16, int32,
 *     int64, uint8, uint16, uint32, uint64, float16, float32.
-*     Tensors to be concatenated. All must have size 1 in the first dimension and same shape.
-*     It's a dynamic input.
+*     Tensors to be concatenated. All must have size 1 in the first dimension and the same shape.
 * @li shape: A Tensor of the same type as "x".
 * The final shape of the result. Should be equal to the shapes of any input
 * but with the number of input values in the first dimension . \n
@@ -314,7 +307,7 @@ REG_OP(Concat)
 
 *@par Inputs:
 * x: A list of N Tensors. Must be one of the following types: int8, int16, int32,
-*     int64, uint8, uint16, uint32, uint64, float16, float32, bool . It's a dynamic input. \n
+*     int64, uint8, uint16, uint32, uint64, float16, float32, bool . \n
 
 *@par Attributes:
 *@li axis: A optional int, defaultvalue is 0.
@@ -340,7 +333,7 @@ REG_OP(Pack)
 *@par Inputs:
 *Two inputs, including:
 * @li concat_dim: A Tensor of type int32.
-* @li x: A list of 1D Tensor objects of type int32 . It's a dynamic input. \n
+* @li x: A list of 1D Tensor objects of type int32 . \n
 
 *@par Attributes:
 *N: A required int . \n
@@ -364,7 +357,7 @@ REG_OP(ConcatOffset)
 *@par Inputs:
 *Two inputs, including:
 * @li concat_dim: A Tensor of type int32.
-* @li x: A list of 1D Tensor objects of type int32 . It's a dynamic input. \n
+* @li x: A list of 1D Tensor objects of type int32 . \n
 
 *@par Attributes:
 *@li Concat_dim: A required int. Must be within the rank of input "x".
diff --git a/third_party/fwkacllib/inc/ops/transformation_ops.h b/third_party/fwkacllib/inc/ops/transformation_ops.h
index edc55820..ed46d95c 100644
--- a/third_party/fwkacllib/inc/ops/transformation_ops.h
+++ b/third_party/fwkacllib/inc/ops/transformation_ops.h
@@ -235,12 +235,8 @@ REG_OP(BatchToSpaceND)
 *@par Outputs:
 *y: A Tensor with format NC1HWC0. Has the same type as input "x".
 
-
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator BatchToSpaceND.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use BatchToSpaceND instead.
 */
 REG_OP(BatchToSpaceNDD)
     .INPUT(x, TensorType::BasicType())
@@ -287,9 +283,6 @@ REG_OP(SpaceToBatchND)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator SpaceToBatchND.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use SpaceToBatchND instead.
 */
 REG_OP(SpaceToBatchNDD)
     .INPUT(x, TensorType::BasicType())
@@ -411,9 +404,6 @@ REG_OP(BatchToSpace)
 
 *@par Third-party framework compatibility
 * Compatible with the TensorFlow operator BatchToSpace.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use BatchToSpace instead.
 */
 REG_OP(BatchToSpaceD)
     .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT64, DT_INT32, DT_UINT8,
@@ -467,9 +457,6 @@ REG_OP(SpaceToBatch)
 *y: A Tensor. Has the same type as input "x".
 *@par Third-party framework compatibility
 *@ Compatible with the TensorFlow operator SpaceToBatch.
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use SpaceToBatch instead.
 */
 REG_OP(SpaceToBatchD)
     .INPUT(x, TensorType::BasicType())
@@ -598,9 +585,6 @@ REG_OP(ExtractVolumePatches)
 
 *@par Outputs:
 *y: A Tensor. Has the same type as "x".
-*
-* @par Restrictions:
-* Warning: THIS FUNCTION IS DEPRECATED. Please use ConfusionTranspose instead.
 */
 REG_OP(ConfusionTransposeD)
     .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_UINT8,
@@ -664,11 +648,6 @@ REG_OP(FlattenV2)
     .ATTR(end_axis, Int, -1)
     .OP_END_FACTORY_REG(FlattenV2)
 
-REG_OP(DeConvTrans)
-    .INPUT(x, TensorType({DT_INT8}))
-    .OUTPUT(y, TensorType({DT_INT8}))
-    .OP_END_FACTORY_REG(DeConvTrans)
-
 /**
 *@brief Compress large weight to small one. Usually inserted before Conv2d.
 *
diff --git a/third_party/fwkacllib/inc/runtime/base.h b/third_party/fwkacllib/inc/runtime/base.h
index 4b08916e..17243802 100644
--- a/third_party/fwkacllib/inc/runtime/base.h
+++ b/third_party/fwkacllib/inc/runtime/base.h
@@ -19,7 +19,7 @@
 
 #include <stdint.h>
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -580,7 +580,8 @@ RTS_API rtError_t rtLabelListCpy(rtLabel_t *label, uint32_t labelNumber, void *d
  * @return RT_ERROR_INVALID_VALUE for error input
  */
 RTS_API rtError_t rtLabelCreateEx(rtLabel_t *label, rtStream_t stream);
-#ifdef __cplusplus
+
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/runtime/config.h b/third_party/fwkacllib/inc/runtime/config.h
index c64ed16f..6de84c02 100644
--- a/third_party/fwkacllib/inc/runtime/config.h
+++ b/third_party/fwkacllib/inc/runtime/config.h
@@ -19,7 +19,7 @@
 
 #include "base.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -185,7 +185,7 @@ RTS_API rtError_t rtSetPlatformType(rtPlatformType_t platformType);
  */
 RTS_API rtError_t rtMemGetL2Info(rtStream_t stream, void **ptr, uint32_t *size);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/runtime/context.h b/third_party/fwkacllib/inc/runtime/context.h
index cc74a5ed..39651817 100644
--- a/third_party/fwkacllib/inc/runtime/context.h
+++ b/third_party/fwkacllib/inc/runtime/context.h
@@ -19,7 +19,7 @@
 
 #include "base.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -149,7 +149,7 @@ RTS_API rtError_t rtGetGroupInfo(int32_t groupId, rtGroupInfo_t* groupInfo, uint
  */
 RTS_API rtError_t rtGetGroupCount(uint32_t *count);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/runtime/dev.h b/third_party/fwkacllib/inc/runtime/dev.h
index 048be69a..0bff548b 100644
--- a/third_party/fwkacllib/inc/runtime/dev.h
+++ b/third_party/fwkacllib/inc/runtime/dev.h
@@ -19,7 +19,7 @@
 
 #include "base.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -339,7 +339,7 @@ RTS_API rtError_t rtGetPairDevicesInfo(uint32_t devId, uint32_t otherDevId, int3
  * @return RT_ERROR_NONE for ok
  */
 RTS_API rtError_t rtGetRtCapability(rtFeatureType_t featureType, int32_t featureInfo, int64_t *value);
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/runtime/dvfsprofile.h b/third_party/fwkacllib/inc/runtime/dvfsprofile.h
index 60f400b3..e27cd832 100644
--- a/third_party/fwkacllib/inc/runtime/dvfsprofile.h
+++ b/third_party/fwkacllib/inc/runtime/dvfsprofile.h
@@ -19,7 +19,7 @@
 
 #include "base.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -56,7 +56,7 @@ RTS_API rtError_t rtUnsetDvfsProfile();
  */
 RTS_API rtError_t rtGetDvfsProfile(DvfsProfileMode *pmode);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/runtime/event.h b/third_party/fwkacllib/inc/runtime/event.h
index 9dc44766..af7b16d8 100644
--- a/third_party/fwkacllib/inc/runtime/event.h
+++ b/third_party/fwkacllib/inc/runtime/event.h
@@ -19,7 +19,7 @@
 
 #include "base.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -229,7 +229,7 @@ RTS_API rtError_t rtNotifyGetAddrOffset(rtNotify_t notify, uint64_t *devAddrOffs
  */
 RTS_API rtError_t rtSetIpcNotifyPid(const char *name, int32_t pid[], int num);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/runtime/kernel.h b/third_party/fwkacllib/inc/runtime/kernel.h
index 956e033b..2030634a 100644
--- a/third_party/fwkacllib/inc/runtime/kernel.h
+++ b/third_party/fwkacllib/inc/runtime/kernel.h
@@ -20,7 +20,7 @@
 #include "base.h"
 #include "stream.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -529,7 +529,7 @@ RTS_API rtError_t rtStopOnlineProf(rtStream_t stream);
  * @return RT_ERROR_INVALID_VALUE for error input 
  */
 RTS_API rtError_t rtGetOnlineProfData(rtStream_t stream, rtProfDataInfo_t *pProfData, uint32_t profDataNum);
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h
index 8c1a4326..a506e94a 100644
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -24,7 +24,7 @@
 #include "config.h"
 #include "stream.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -491,7 +491,7 @@ RTS_API rtError_t rtSetIpcMemPid(const char *name, int32_t pid[], int num);
  */
 RTS_API rtError_t rtRDMADBSend(uint32_t dbIndex, uint64_t dbInfo, rtStream_t stream);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/runtime/rt_model.h b/third_party/fwkacllib/inc/runtime/rt_model.h
index 089a90b7..59a1ba7d 100644
--- a/third_party/fwkacllib/inc/runtime/rt_model.h
+++ b/third_party/fwkacllib/inc/runtime/rt_model.h
@@ -19,7 +19,7 @@
 
 #include "base.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -430,7 +430,7 @@ rtError_t rtDebugRegister(rtModel_t model, uint32_t flag, const void *addr, uint
  */
 RTS_API rtError_t rtDebugUnRegister(rtModel_t model);
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/runtime/stream.h b/third_party/fwkacllib/inc/runtime/stream.h
index 3123c3a9..ab542d89 100644
--- a/third_party/fwkacllib/inc/runtime/stream.h
+++ b/third_party/fwkacllib/inc/runtime/stream.h
@@ -20,7 +20,7 @@
 #include "base.h"
 #include "event.h"
 
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 extern "C" {
 #endif
 
@@ -188,7 +188,7 @@ RTS_API rtError_t rtStreamActive(rtStream_t active_stream, rtStream_t stream);
  */
 RTS_API rtError_t rtStreamSwitchN(void *ptr, uint32_t size, void *valuePtr, rtStream_t *trueStreamPtr,
                                   uint32_t elementSize, rtStream_t stream, rtSwitchDataType_t dataType);
-#ifdef __cplusplus
+#if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
 }
 #endif
 
diff --git a/third_party/fwkacllib/inc/toolchain/adx_datadump_server.h b/third_party/fwkacllib/inc/toolchain/adx_datadump_server.h
index a1c39a51..67adecd9 100644
--- a/third_party/fwkacllib/inc/toolchain/adx_datadump_server.h
+++ b/third_party/fwkacllib/inc/toolchain/adx_datadump_server.h
@@ -1,12 +1,18 @@
 /**
-* @file adx_datadump_server.h
-*
-* Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved.
-*
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-*/
+ * Copyright 2019-2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 #ifndef ADX_DATADUMP_SERVER_H
 #define ADX_DATADUMP_SERVER_H