From 227571fc75a951344be05cc151602a70b0cf9a01 Mon Sep 17 00:00:00 2001
From: unknown <zhaozhixuan2@hisilicon.com>
Date: Thu, 28 Jan 2021 21:23:18 +0800
Subject: [PATCH] multi_task for single_op.

---
 ge/CMakeLists.txt                                  |   9 --
 ge/hybrid/common/tensor_value.cc                   |   2 +-
 ge/hybrid/executor/hybrid_model_executor.cc        |   4 +-
 ge/hybrid/executor/subgraph_executor.cc            |  45 ++++++-
 ge/hybrid/executor/subgraph_executor.h             |  15 ++-
 ge/hybrid/model/hybrid_model.cc                    |   8 +-
 ge/hybrid/model/hybrid_model.h                     |   3 +-
 ge/hybrid/model/hybrid_model_builder.cc            | 133 +++++++++++++++++----
 ge/hybrid/model/hybrid_model_builder.h             |   2 +
 ge/hybrid/node_executor/task_context.cc            |   6 +-
 ge/single_op/single_op.cc                          |  21 +++-
 ge/single_op/single_op.h                           |   5 +-
 ge/single_op/single_op_model.cc                    |  36 ++++++
 ge/single_op/stream_resource.cc                    |   4 +
 ge/single_op/stream_resource.h                     |   1 +
 tests/ut/ge/CMakeLists.txt                         |  40 +++++++
 .../ut/ge/single_op/single_op_manager_unittest.cc  |   1 -
 17 files changed, 290 insertions(+), 45 deletions(-)

diff --git a/ge/CMakeLists.txt b/ge/CMakeLists.txt
index 16494a33..be47c8dd 100755
--- a/ge/CMakeLists.txt
+++ b/ge/CMakeLists.txt
@@ -639,15 +639,6 @@ set(INFER_SRC_LIST
     "graph/load/model_manager/task_info/model_exit_task_info.cc"
     "graph/load/model_manager/task_info/super_kernel/super_kernel_factory.cc"
     "graph/load/model_manager/task_info/super_kernel/super_kernel.cc"
-    "single_op/task/op_task.cc"
-    "single_op/task/build_task_utils.cc"
-    "single_op/task/tbe_task_builder.cc"
-    "single_op/task/aicpu_task_builder.cc"
-    "single_op/task/aicpu_kernel_task_builder.cc"
-    "single_op/single_op.cc"
-    "single_op/single_op_model.cc"
-    "single_op/stream_resource.cc"
-    "single_op/single_op_manager.cc"
     "hybrid/hybrid_davinci_model_stub.cc"
     "ir_build/ge_ir_build.cc"
     "ir_build/atc_ir_common.cc"
diff --git a/ge/hybrid/common/tensor_value.cc b/ge/hybrid/common/tensor_value.cc
index 16ecfaa4..c691c6f3 100644
--- a/ge/hybrid/common/tensor_value.cc
+++ b/ge/hybrid/common/tensor_value.cc
@@ -71,7 +71,7 @@ TensorValue::TensorValue(void *buffer, size_t size) : ref_buffer_(buffer), ref_s
 TensorValue::~TensorValue() { Destroy(); }
 
 void TensorValue::Destroy() {
-  if (buffer_ != nullptr || ref_buffer_ != nullptr) {
+  if (buffer_ != nullptr) {
     GELOGD("Unref tensor: %s", DebugString().c_str());
     buffer_.reset();
   }
diff --git a/ge/hybrid/executor/hybrid_model_executor.cc b/ge/hybrid/executor/hybrid_model_executor.cc
index c47dafc1..9c4bb217 100755
--- a/ge/hybrid/executor/hybrid_model_executor.cc
+++ b/ge/hybrid/executor/hybrid_model_executor.cc
@@ -71,12 +71,14 @@ Status HybridModelExecutor::ExecuteGraphInternal(SubgraphExecutor &executor,
   GE_CHK_STATUS_RET_NOLOG(ResetExecutionContext(context_));
   RECORD_MODEL_EXECUTION_EVENT(&context_, "[InitContext] End");
 
-  HYBRID_CHK_STATUS_RET(executor.ExecuteAsync(args.inputs, args.input_desc), "Failed to execute partitioned call.");
+  HYBRID_CHK_STATUS_RET(executor.ExecuteAsync(args.inputs, args.input_desc, args.outputs),
+                        "Failed to execute partitioned call.");
   RECORD_MODEL_EXECUTION_EVENT(&context_, "[ExecuteAsync] End");
 
   HYBRID_CHK_STATUS_RET(executor.Synchronize(), "Failed to sync root graph.");
   RECORD_MODEL_EXECUTION_EVENT(&context_, "[Synchronize] End");
 
+  args.outputs.clear();
   HYBRID_CHK_STATUS_RET(executor.GetOutputs(args.outputs, args.output_desc), "Failed to get outputs");
   RECORD_MODEL_EXECUTION_EVENT(&context_, "[GetOutput] End");
   return SUCCESS;
diff --git a/ge/hybrid/executor/subgraph_executor.cc b/ge/hybrid/executor/subgraph_executor.cc
index f8f122b1..8b194233 100644
--- a/ge/hybrid/executor/subgraph_executor.cc
+++ b/ge/hybrid/executor/subgraph_executor.cc
@@ -131,10 +131,14 @@ Status SubgraphExecutor::InitInputsForKnownShape(const std::vector<TensorValue>
 }
 
 Status SubgraphExecutor::ExecuteAsync(const std::vector<TensorValue> &inputs,
-                                      const std::vector<ConstGeTensorDescPtr> &input_desc) {
+                                      const std::vector<ConstGeTensorDescPtr> &input_desc,
+                                      const std::vector<TensorValue> &outputs) {
   GELOGD("[%s] is dynamic = %s", graph_item_->GetName().c_str(), graph_item_->IsDynamic() ? "true" : "false");
   GE_CHK_STATUS_RET(Init(inputs, input_desc), "[%s] Failed to init executor.", graph_item_->GetName().c_str());
-
+  if (!outputs.empty()) {
+    GE_CHK_STATUS_RET(EnableOutputZeroCopy(outputs),
+                      "Failed to enable output zero copy by user provided outputs.");
+  }
   if (!graph_item_->IsDynamic()) {
     return ExecuteAsyncForKnownShape(inputs);
   }
@@ -144,6 +148,11 @@ Status SubgraphExecutor::ExecuteAsync(const std::vector<TensorValue> &inputs,
   return SUCCESS;
 }
 
+Status SubgraphExecutor::ExecuteAsync(const std::vector<TensorValue> &inputs,
+                                      const std::vector<ConstGeTensorDescPtr> &input_desc) {
+  return ExecuteAsync(inputs, input_desc, {});  // empty outputs: the 3-arg overload skips EnableOutputZeroCopy
+}
+
 Status SubgraphExecutor::ExecuteAsyncForKnownShape(const std::vector<TensorValue> &inputs) {
   GELOGD("[%s] subgraph is not dynamic.", graph_item_->GetName().c_str());
   if (graph_item_->GetAllNodes().size() != 1) {
@@ -440,5 +449,37 @@ Status SubgraphExecutor::SetOutputsToParentNode(TaskContext &task_context) {
 
   return SUCCESS;
 }
+
+// Sets outputs[i] as the output tensor of the (node, index) pair on the i-th output edge of the graph item.
+// Returns PARAM_INVALID if the number of tensors differs from the number of output edges.
+Status SubgraphExecutor::EnableOutputZeroCopy(const vector<TensorValue> &outputs) {
+  GELOGD("To enable zero copy, output number = %zu", outputs.size());
+  const auto &output_edges = graph_item_->GetOutputEdges();
+  // Op -> NetOutput, set the output tensor of Op that output to the NetOutput node
+  if (outputs.size() != output_edges.size()) {
+    GELOGE(PARAM_INVALID, "Output number mismatches, expect = %zu, but given = %zu",
+           output_edges.size(), outputs.size());
+    return PARAM_INVALID;
+  }
+
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    auto &output_tensor = outputs[i];
+    auto &output_node = output_edges[i].first;
+    int output_idx = output_edges[i].second;
+    GELOGD("[%s] Set output tensor[%zu] to [%s]'s output[%d], tensor = %s",
+           graph_item_->GetName().c_str(),
+           i,
+           output_node->NodeName().c_str(),
+           output_idx,
+           output_tensor.DebugString().c_str());
+
+    GE_CHK_STATUS_RET(subgraph_context_->SetOutput(*output_node, output_idx, output_tensor),
+                      "[%s] Failed to set output tensor[%zu]",
+                      graph_item_->GetName().c_str(), i);
+  }
+
+  GELOGD("Done enabling zero copy for outputs successfully.");
+  return SUCCESS;
+}
 }  // namespace hybrid
 }  // namespace ge
diff --git a/ge/hybrid/executor/subgraph_executor.h b/ge/hybrid/executor/subgraph_executor.h
index 4523e2c4..9206ad8d 100644
--- a/ge/hybrid/executor/subgraph_executor.h
+++ b/ge/hybrid/executor/subgraph_executor.h
@@ -43,7 +43,19 @@ class SubgraphExecutor {
    * @param input_desc      input tensor descriptions
    * @return SUCCESS on success, error code otherwise
    */
-  Status ExecuteAsync(const std::vector<TensorValue> &inputs, const std::vector<ConstGeTensorDescPtr> &input_desc);
+  Status ExecuteAsync(const std::vector<TensorValue> &inputs,
+                      const std::vector<ConstGeTensorDescPtr> &input_desc);
+
+  /**
+   * Execute subgraph async; output tensor address(not data) and descriptions are valid after this method returned
+   * @param inputs          input tensors
+   * @param input_desc      input tensor descriptions
+   * @param outputs         caller-provided output tensors; when non-empty they are set as the subgraph outputs
+   * @return SUCCESS on success, error code otherwise
+   */
+  Status ExecuteAsync(const std::vector<TensorValue> &inputs,
+                      const std::vector<ConstGeTensorDescPtr> &input_desc,
+                      const std::vector<TensorValue> &outputs);
 
   /**
    * Execute subgraph async, output tensor address(not data) and output tensor descriptions are
@@ -75,6 +87,7 @@ class SubgraphExecutor {
   Status GetOutputs(std::vector<TensorValue> &outputs, std::vector<ConstGeTensorDescPtr> &output_desc);
 
  private:
+  Status EnableOutputZeroCopy(const std::vector<TensorValue> &outputs);
   Status PrepareForExecution(GraphExecutionContext *ctx, NodeState &node_state);
   static Status InferShape(ShapeInferenceEngine *shape_inference_engine, NodeState &node_state);
   Status Init(const std::vector<TensorValue> &inputs,
diff --git a/ge/hybrid/model/hybrid_model.cc b/ge/hybrid/model/hybrid_model.cc
index 7e5d8fe5..da9f4fbf 100644
--- a/ge/hybrid/model/hybrid_model.cc
+++ b/ge/hybrid/model/hybrid_model.cc
@@ -40,9 +40,13 @@ HybridModel::~HybridModel() {
   GELOGD("[%s] HybridModel destroyed.", model_name_.c_str());
 }
 
-Status HybridModel::Init() {
+Status HybridModel::Init(bool is_single_op) {  // is_single_op: build via BuildForSingleOp() instead of Build()
   GELOGD("Start to init hybrid model.");
-  GE_CHK_STATUS_RET(HybridModelBuilder(*this).Build(), "Failed to build hybrid model.");
+  if (is_single_op) {
+    GE_CHK_STATUS_RET(HybridModelBuilder(*this).BuildForSingleOp(), "Failed to build hybrid model.");
+  } else {
+    GE_CHK_STATUS_RET(HybridModelBuilder(*this).Build(), "Failed to build hybrid model.");
+  }
   GELOGD("HybridModel initialized successfully.");
   return SUCCESS;
 }
diff --git a/ge/hybrid/model/hybrid_model.h b/ge/hybrid/model/hybrid_model.h
index 72495cad..8849f57a 100644
--- a/ge/hybrid/model/hybrid_model.h
+++ b/ge/hybrid/model/hybrid_model.h
@@ -37,7 +37,7 @@ class HybridModel {
 
   ~HybridModel();
 
-  Status Init();
+  Status Init(bool is_single_op = false);
 
   const NodeItem *GetNodeItem(const NodePtr &node) const;
 
@@ -136,6 +136,7 @@ class HybridModel {
   uint32_t device_id_ = 0;
   uint32_t model_id_ = 0;
   uint8_t *var_mem_base_ = nullptr;
+  std::unique_ptr<TensorBuffer> weight_buffer_;
   RuntimeParam root_runtime_param_;
 };
 }  // namespace hybrid
diff --git a/ge/hybrid/model/hybrid_model_builder.cc b/ge/hybrid/model/hybrid_model_builder.cc
index b314c6a7..6c071540 100755
--- a/ge/hybrid/model/hybrid_model_builder.cc
+++ b/ge/hybrid/model/hybrid_model_builder.cc
@@ -147,6 +147,21 @@ Status HybridModelBuilder::Build() {
   return SUCCESS;
 }
 
+Status HybridModelBuilder::BuildForSingleOp() {  // single-op build: index root-graph tasks, load graph/weights/tasks
+  GE_CHK_STATUS_RET(ValidateParams(), "Failed to validate GeRootModel");
+  hybrid_model_.model_name_ = ge_root_model_->GetRootGraph()->GetName();
+  GELOGI("[%s] Start to build hybrid model.", GetGraphName());
+  auto ret = ge_root_model_->GetSubgraphInstanceNameToModel();  // same getter InitWeights() uses
+  const GeModelPtr ge_model = ret[ge_root_model_->GetRootGraph()->GetName()];  // NOTE(review): may be null if absent
+  GE_CHK_STATUS_RET(IndexTaskDefs(ge_root_model_->GetRootGraph(), ge_model),
+                    "[%s] Failed to index task defs", GetGraphName());
+  GE_CHK_STATUS_RET(LoadGraph(), "[%s] Failed to load graph", GetGraphName());
+  GE_CHK_STATUS_RET(InitWeights(), "[%s] Failed to init weights", GetGraphName());
+  GE_CHK_STATUS_RET(LoadTasks(), "[%s] Failed to load tasks", GetGraphName());
+  GELOGI("[%s] Done building hybrid model for single op successfully.", GetGraphName());
+  return SUCCESS;
+}
+
 Status HybridModelBuilder::ValidateParams() {
   GE_CHECK_NOTNULL(ge_root_model_);
   GE_CHECK_NOTNULL(ge_root_model_->GetRootGraph());
@@ -951,46 +966,71 @@ Status HybridModelBuilder::InitVariableTensors() {
 }
 
 Status HybridModelBuilder::InitWeights() {
+  // Copy the root model's weight blob to device once; each CONSTANT node then maps to weight_base + its data offset
+  const auto &root_graph = ge_root_model_->GetRootGraph();
+  const auto &subgraph_models = ge_root_model_->GetSubgraphInstanceNameToModel();
+  auto iter = subgraph_models.find(root_graph->GetName());
+  if (iter == subgraph_models.end()) {
+    GELOGD("Root graph model not found");
+    return SUCCESS;
+  }
+
+  auto &root_model = iter->second;
+  const auto &weight_buffer = root_model->GetWeight();
+  if (weight_buffer.GetSize() == 0) {
+    GELOGD("weight is empty");
+    return SUCCESS;
+  }
+
   auto allocator = NpuMemoryAllocator::GetAllocator();
   GE_CHECK_NOTNULL(allocator);
-
-  for (auto &it : hybrid_model_.node_items_) {
-    auto &node_item = it.second;
-    if (node_item->node_type != CONSTANT) {
+  hybrid_model_.weight_buffer_ = TensorBuffer::Create(allocator, weight_buffer.GetSize());
+  GE_CHECK_NOTNULL(hybrid_model_.weight_buffer_);
+  auto weight_base = reinterpret_cast<uint8_t *>(hybrid_model_.weight_buffer_->GetData());
+  GE_CHK_RT_RET(rtMemcpy(weight_base,
+                         hybrid_model_.weight_buffer_->GetSize(),
+                         weight_buffer.GetData(),
+                         weight_buffer.GetSize(),
+                         RT_MEMCPY_HOST_TO_DEVICE));
+
+  GELOGI("Init weight mem successfully, weight base %p, weight size = %zu",
+         weight_base,
+         hybrid_model_.weight_buffer_->GetSize());
+  for (auto &node : root_graph->GetDirectNode()) {
+    if (node->GetType() != CONSTANT) {
       continue;
     }
 
-    const auto &constant_node = node_item->node;
-    auto op_desc = constant_node->GetOpDesc();
+    auto op_desc = node->GetOpDesc();
     auto v_weights = ModelUtils::GetWeights(op_desc);
     if (v_weights.empty()) {
-      GELOGE(INTERNAL_ERROR, "[%s] Constant has no value", constant_node->GetName().c_str());
+      GELOGE(INTERNAL_ERROR, "[%s] Constant has no value", node->GetName().c_str());
       return INTERNAL_ERROR;
     }
     auto *ge_tensor = const_cast<GeTensor *>(v_weights[0].get());
     auto output_desc = op_desc->MutableOutputDesc(0);
     GE_CHECK_NOTNULL(output_desc);
-    auto tensor_size = ge_tensor->GetData().GetSize();
-    GELOGD("[%s] Start to init Constant node [%s], size = %ld",
+    GE_CHECK_NOTNULL(ge_tensor);
+    const GeTensorDesc &tensor_desc = ge_tensor->GetTensorDesc();
+    int64_t tensor_size = 0;
+    GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetSize(*output_desc, tensor_size),
+                            "[%s] Failed to get tensor size", node->GetName().c_str());
+    int64_t data_offset = 0;
+    GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetDataOffset(tensor_desc, data_offset),
+                            "[%s] Failed to get data offset", node->GetName().c_str());
+    GELOGD("[%s] Start to init Constant node [%s], size = %ld, offset = %ld",
            GetGraphName(),
-           constant_node->GetName().c_str(),
-           tensor_size);
+           node->GetName().c_str(),
+           tensor_size,
+           data_offset);
 
-    auto tensor_buffer = TensorBuffer::Create(allocator, tensor_size);
+    auto tensor_buffer = TensorBuffer::Create(weight_base + data_offset, tensor_size);
     GE_CHECK_NOTNULL(tensor_buffer);
     std::unique_ptr<TensorValue> constant_tensor(new (std::nothrow)TensorValue(std::move(tensor_buffer)));
     GE_CHECK_NOTNULL(constant_tensor);
     constant_tensor->SetName("Constant_" + op_desc->GetName());
-    if (tensor_size > 0) {
-      GE_CHK_RT_RET(rtMemcpy(constant_tensor->MutableData(),
-                             constant_tensor->GetSize(),
-                             ge_tensor->GetData().data(),
-                             ge_tensor->GetData().size(),
-                             RT_MEMCPY_HOST_TO_DEVICE));
-    }
-
-    hybrid_model_.constant_tensors_.emplace(constant_node, std::move(constant_tensor));
-    GELOGD("[%s] Constant node [%s] added, size = %ld", GetGraphName(), constant_node->GetName().c_str(), tensor_size);
+    hybrid_model_.constant_tensors_.emplace(node, std::move(constant_tensor));
+    GELOGD("[%s] Constant node [%s] added, size = %ld", GetGraphName(), node->GetName().c_str(), tensor_size);
   }
   return SUCCESS;
 }
@@ -1038,6 +1078,53 @@ Status HybridModelBuilder::LoadGeModel(ComputeGraph &sub_graph, const GeModelPtr
   return SUCCESS;
 }
 
+// Indexes the task defs of ge_model by node: each kernel / kernel_ex / HCCL task is appended to
+// hybrid_model_.task_defs_ under the direct node of sub_graph whose op id equals the task's op_index.
+Status HybridModelBuilder::IndexTaskDefs(const ComputeGraphPtr &sub_graph, const GeModelPtr &ge_model) {
+  GE_CHECK_NOTNULL(sub_graph);
+  GE_CHECK_NOTNULL(ge_model);
+  GELOGD("To index tasks for subgraph: %s", sub_graph->GetName().c_str());
+  std::unordered_map<int64_t, NodePtr> node_map;
+  for (const auto &node : sub_graph->GetDirectNode()) {
+    GE_CHECK_NOTNULL(node);
+    GE_CHECK_NOTNULL(node->GetOpDesc());
+    auto node_id = node->GetOpDesc()->GetId();
+    GELOGD("op_index = %ld, node_name = %s", node_id, node->GetName().c_str());
+    node_map.emplace(node_id, node);
+  }
+  const auto model_task_def = ge_model->GetModelTaskDefPtr();
+  GE_CHECK_NOTNULL(model_task_def);
+  const auto &tasks = model_task_def->task();  // by reference: avoids copying the whole task list
+  for (int i = 0; i < tasks.size(); ++i) {
+    const domi::TaskDef &task_def = tasks[i];
+    GELOGI("Task id = %d, task type = %d", i, task_def.type());
+    auto task_type = static_cast<rtModelTaskType_t>(task_def.type());
+    uint32_t op_index = -1;
+    if (task_type == RT_MODEL_TASK_KERNEL) {
+      op_index = task_def.kernel().context().op_index();
+    } else if (task_type == RT_MODEL_TASK_KERNEL_EX) {
+      op_index = task_def.kernel_ex().op_index();
+    } else if (task_type == RT_MODEL_TASK_HCCL) {
+      op_index = task_def.kernel_hccl().op_index();
+    } else {
+      GELOGD("Skip task type: %d", static_cast<int>(task_type));
+      continue;
+    }
+    auto iter = node_map.find(op_index);
+    if (iter == node_map.end()) {
+      GELOGE(INTERNAL_ERROR, "Failed to get node by index = %u", op_index);
+      return INTERNAL_ERROR;
+    }
+    auto &node = iter->second;
+    if (task_type == RT_MODEL_TASK_KERNEL) {
+      ge_model->GetTBEKernelStore().LoadTBEKernelBinToOpDesc(node->GetOpDesc());
+    }
+    GELOGD("Task loaded for node: %s, task type = %d, op_index = %u", node->GetName().c_str(), task_type, op_index);
+    hybrid_model_.task_defs_[node].emplace_back(task_def);
+  }
+  return SUCCESS;
+}
+
 Status HybridModelBuilder::IndexTaskDefs() {
   const auto &root_graph = ge_root_model_->GetRootGraph();
   if (SetOutputNameAttr(*root_graph) != SUCCESS) {
diff --git a/ge/hybrid/model/hybrid_model_builder.h b/ge/hybrid/model/hybrid_model_builder.h
index 045bf3ef..213bf0f4 100644
--- a/ge/hybrid/model/hybrid_model_builder.h
+++ b/ge/hybrid/model/hybrid_model_builder.h
@@ -35,6 +35,7 @@ class HybridModelBuilder {
   explicit HybridModelBuilder(HybridModel &hybrid_model);
   ~HybridModelBuilder() = default;
   Status Build();
+  Status BuildForSingleOp();
 
  private:
   static Status UpdateAnchorStatus(const NodePtr &node);
@@ -64,6 +65,7 @@ class HybridModelBuilder {
   Status ParseDependentInputNodes(NodeItem &node_item, const std::vector<string> &dependencies);
   Status ParseDependentForFusedSubgraph(NodeItem &node_item);
   Status IndexTaskDefs();
+  Status IndexTaskDefs(const ComputeGraphPtr &sub_graph, const GeModelPtr &ge_model);
   Status IndexSpecialNodes();
   Status InitRuntimeParams();
   Status InitModelMem();
diff --git a/ge/hybrid/node_executor/task_context.cc b/ge/hybrid/node_executor/task_context.cc
index bc318124..039effa2 100644
--- a/ge/hybrid/node_executor/task_context.cc
+++ b/ge/hybrid/node_executor/task_context.cc
@@ -251,6 +251,10 @@ Status TaskContext::AllocateOutput(int index,
     }
   }
 
+  if (outputs_start_[index].GetSize() > 0) {  // zero-fill the output buffer; nothing to clear when it is empty
+    rtMemset(outputs_start_[index].MutableData(), outputs_start_[index].GetSize(), 0, outputs_start_[index].GetSize());
+  }
+
   if (execution_context_->trace_enabled) {
     outputs_start_[index].SetName(node_item_->NodeName() + "_out_" + std::to_string(index));
   }
@@ -397,7 +401,7 @@ Status TaskContext::PropagateOutputs() {
       subgraph_context_->all_inputs_[input_offset] = *tensor;
       if (execution_context_->trace_enabled) {
         subgraph_context_->all_inputs_[input_offset].SetName(
-            node_item_->NodeName() + "_in_" + std::to_string(dst_input_idx));
+            dst_node_item->NodeName() + "_in_" + std::to_string(dst_input_idx));
       }
     }
   }
diff --git a/ge/single_op/single_op.cc b/ge/single_op/single_op.cc
index 4f32bd6b..ee4c84ae 100755
--- a/ge/single_op/single_op.cc
+++ b/ge/single_op/single_op.cc
@@ -256,10 +256,27 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
                                      const vector<DataBuffer> &input_buffers,
                                      vector<GeTensorDesc> &output_desc,
                                      vector<DataBuffer> &output_buffers) {
-  GE_CHECK_NOTNULL(op_task_);
   GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers));
-  std::lock_guard<std::mutex> lk(*stream_mutex_);
+  if (hybrid_model_executor_ != nullptr) {
+    GELOGD("Execute multi-task dynamic single op by hybrid model executor");
+    hybrid::HybridModelExecutor::ExecuteArgs args;
+    for (auto &input : input_buffers) {
+      args.inputs.emplace_back(hybrid::TensorValue(input.data, input.length));
+    }
+    for (auto &output : output_buffers) {
+      args.outputs.emplace_back(hybrid::TensorValue(output.data, output.length));
+    }
+    for (auto &tensor_desc : input_desc) {
+      auto desc = MakeShared<GeTensorDesc>(tensor_desc);
+      GE_CHECK_NOTNULL(desc);
+      args.input_desc.emplace_back(desc);
+    }
 
+    return hybrid_model_executor_->Execute(args);
+  }
+
+  std::lock_guard<std::mutex> lk(*stream_mutex_);
+  GE_CHECK_NOTNULL(op_task_);
   GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_));
   GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get(), kShapeTypeDynamic));
   return SUCCESS;
diff --git a/ge/single_op/single_op.h b/ge/single_op/single_op.h
index d677f94a..b350b684 100755
--- a/ge/single_op/single_op.h
+++ b/ge/single_op/single_op.h
@@ -28,6 +28,7 @@
 #include "runtime/stream.h"
 #include "task/op_task.h"
 #include "cce/aicpu_engine_struct.h"
+#include "hybrid/executor/hybrid_model_executor.h"
 
 namespace ge {
 class StreamResource;
@@ -46,7 +47,7 @@ class SingleOp {
   Status GetArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
 
   friend class SingleOpModel;
-  StreamResource *stream_resource_;
+  StreamResource *stream_resource_ = nullptr;
   std::mutex *stream_mutex_;
   rtStream_t stream_ = nullptr;
   std::vector<void *> input_addr_list_;
@@ -77,6 +78,8 @@ class DynamicSingleOp {
                         std::vector<DataBuffer> &outputs) const;
 
   std::unique_ptr<OpTask> op_task_;
+  std::unique_ptr<hybrid::HybridModel> hybrid_model_;
+  std::unique_ptr<hybrid::HybridModelExecutor> hybrid_model_executor_;
   uintptr_t resource_id_ = 0;
   std::mutex *stream_mutex_;
   rtStream_t stream_ = nullptr;
diff --git a/ge/single_op/single_op_model.cc b/ge/single_op/single_op_model.cc
index 7d092091..d84963dc 100755
--- a/ge/single_op/single_op_model.cc
+++ b/ge/single_op/single_op_model.cc
@@ -31,6 +31,8 @@
 #include "task/aicpu_task_builder.h"
 #include "task/aicpu_kernel_task_builder.h"
 #include "task/tbe_task_builder.h"
+#include "hybrid/executor/hybrid_model_executor.h"
+#include "hybrid/node_executor/node_executor.h"
 
 static std::atomic<std::uint64_t> aicpu_kernel_id(0);
 
@@ -42,6 +44,20 @@ namespace ge {
 namespace {
 const size_t kDataOutputNum = 1;
 }  // namespace
+static Status IfInferDepend(GeModelPtr &ge_model, bool &flag) {  // sets flag when any node has infer depends
+  auto comp_graph = GraphUtils::GetComputeGraph(ge_model->GetGraph());
+  GE_CHECK_NOTNULL(comp_graph);
+  for (const auto &node : comp_graph->GetAllNodes()) {
+    auto op_desc = node->GetOpDesc();
+    GE_CHECK_NOTNULL(op_desc);
+    if (!op_desc->GetOpInferDepends().empty()) {
+      flag = true;
+      return SUCCESS;
+    }
+  }
+  return SUCCESS;
+}
+
 SingleOpModel::SingleOpModel(const std::string &model_name, const void *model_data, uint32_t model_size)
     : model_name_(model_name), ori_model_data_(model_data), ori_model_size_(model_size) {}
 
@@ -478,6 +494,26 @@ Status SingleOpModel::BuildDynamicOp(StreamResource &resource, DynamicSingleOp &
   single_op.num_outputs_ = netoutput_op_->GetAllInputsSize();
   GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource));
   model_params_.memory_size = UINT_MAX;
+
+  auto ge_model = model_helper_.GetGeModel();
+  GE_CHECK_NOTNULL(ge_model);
+  bool infer_depend_flag = false;
+  GE_CHK_STATUS_RET_NOLOG(IfInferDepend(ge_model, infer_depend_flag));
+  if (ge_model->GetModelTaskDefPtr()->task_size() > 1 || infer_depend_flag) {
+    GELOGD("Build single op HybridModel.");
+    GE_CHK_STATUS_RET_NOLOG(hybrid::NodeExecutorManager::GetInstance().EnsureInitialized());
+    single_op.hybrid_model_.reset(new (std::nothrow)hybrid::HybridModel(model_helper_.GetGeRootModel()));
+    GE_CHECK_NOTNULL(single_op.hybrid_model_);
+    GE_CHK_STATUS_RET(single_op.hybrid_model_->Init(true), "Failed to init hybrid model");
+    int32_t device_id = 0;
+    GE_CHK_RT_RET(rtGetDevice(&device_id));
+    single_op.hybrid_model_executor_.reset(new (std::nothrow)hybrid::HybridModelExecutor(single_op.hybrid_model_.get(),
+                                                                                         device_id,
+                                                                                         resource.GetStream()));
+    GE_CHECK_NOTNULL(single_op.hybrid_model_executor_);
+    GE_CHK_STATUS_RET(single_op.hybrid_model_executor_->Init(), "Failed to init hybrid model");
+    return SUCCESS;
+  }
   return BuildTaskListForDynamicOp(single_op);
 }
 }  // namespace ge
diff --git a/ge/single_op/stream_resource.cc b/ge/single_op/stream_resource.cc
index db6b7c47..a3acf6b7 100755
--- a/ge/single_op/stream_resource.cc
+++ b/ge/single_op/stream_resource.cc
@@ -61,6 +61,10 @@ DynamicSingleOp *StreamResource::GetDynamicOperator(const void *key) {
   return it->second.get();
 }
 
+rtStream_t StreamResource::GetStream() const {  // accessor for the stream stored by SetStream()
+  return stream_;
+}
+
 void StreamResource::SetStream(rtStream_t stream) {
   stream_ = stream;
 }
diff --git a/ge/single_op/stream_resource.h b/ge/single_op/stream_resource.h
index d5bc941a..d2c1ca36 100755
--- a/ge/single_op/stream_resource.h
+++ b/ge/single_op/stream_resource.h
@@ -37,6 +37,7 @@ class StreamResource {
   StreamResource(StreamResource &&) = delete;
   StreamResource &operator=(const StreamResource &) = delete;
   StreamResource &operator=(StreamResource &&) = delete;
+  rtStream_t GetStream() const;
   void SetStream(rtStream_t stream);
 
   SingleOp *GetOperator(const void *key);
diff --git a/tests/ut/ge/CMakeLists.txt b/tests/ut/ge/CMakeLists.txt
index b98c8546..11a6ab0b 100755
--- a/tests/ut/ge/CMakeLists.txt
+++ b/tests/ut/ge/CMakeLists.txt
@@ -562,6 +562,46 @@ set(SINGLE_OP_SRC_FILES
     "${GE_CODE_DIR}/ge/single_op/single_op_manager.cc"
     "${GE_CODE_DIR}/ge/single_op/task/aicpu_task_builder.cc"
     "${GE_CODE_DIR}/ge/single_op/task/aicpu_kernel_task_builder.cc"
+    "${GE_CODE_DIR}/ge/hybrid/common/tensor_value.cc"
+    "${GE_CODE_DIR}/ge/hybrid/common/npu_memory_allocator.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/rt_callback_manager.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/node_state.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/node_done_manager.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_profiler.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_model_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_model_async_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/hybrid_execution_context.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/subgraph_context.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/subgraph_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/worker/task_compile_engine.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/worker/shape_inference_engine.cc"
+    "${GE_CODE_DIR}/ge/hybrid/executor/worker/execution_engine.cc"
+    "${GE_CODE_DIR}/ge/hybrid/model/hybrid_model.cc"
+    "${GE_CODE_DIR}/ge/hybrid/model/hybrid_model_builder.cc"
+    "${GE_CODE_DIR}/ge/hybrid/model/node_item.cc"
+    "${GE_CODE_DIR}/ge/hybrid/model/graph_item.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/aicore/aicore_node_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/aicore/aicore_op_task.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/aicore/aicore_task_builder.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/aicore/aicore_task_compiler.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/aicpu/aicpu_ext_info.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/aicpu/aicpu_node_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/compiledsubgraph/known_node_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/ge_local/ge_local_node_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/host_cpu/host_cpu_node_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/host_cpu/kernel_factory.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/host_cpu/kernel/no_op_kernel.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/host_cpu/kernel/variable_kernel.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/host_cpu/kernel/assign_kernel.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/host_cpu/kernel/random_uniform_kernel.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/host_cpu/kernel/data_kernel.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/controlop/control_op_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/partitioned_call/partitioned_call_node_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/hccl/hccl_node_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/rts/rts_node_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/node_executor.cc"
+    "${GE_CODE_DIR}/ge/hybrid/node_executor/task_context.cc"
+    "${GE_CODE_DIR}/ge/hybrid/hybrid_davinci_model.cc"
 )
 
 # test files
diff --git a/tests/ut/ge/single_op/single_op_manager_unittest.cc b/tests/ut/ge/single_op/single_op_manager_unittest.cc
index a70d2984..05da8683 100644
--- a/tests/ut/ge/single_op/single_op_manager_unittest.cc
+++ b/tests/ut/ge/single_op/single_op_manager_unittest.cc
@@ -17,7 +17,6 @@
 #include <gtest/gtest.h>
 #include <vector>
 
-#include "cce/taskdown_common.hpp"
 #include "runtime/rt.h"
 
 #define protected public