diff --git a/ge/graph/build/task_generator.cc b/ge/graph/build/task_generator.cc
index dabdc5d2..c9dcf590 100755
--- a/ge/graph/build/task_generator.cc
+++ b/ge/graph/build/task_generator.cc
@@ -743,6 +743,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
   GELOGI("Start AutoFindBpOpIndex");
   NodePtr bp_node = nullptr;
   uint32_t current_idx = 0;
+  uint32_t netoutput_idx = 0;
   for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) {
     OpDescPtr op_desc = node->GetOpDesc();
     GE_CHECK_NOTNULL(op_desc);
@@ -760,6 +761,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
     if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) {
       if (bp_node == nullptr) {
         bp_node = node;
+        netoutput_idx = current_idx - 1;
       }
     }
     if (graph->GetNeedIteration()) {
@@ -784,9 +786,13 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
   if (bp_node == nullptr) {
     GELOGW("not find bp_node.");
     return SUCCESS;
+  } else if (bp_node->GetName() == NODE_NAME_NET_OUTPUT) {
+    profiling_point.bp_index = netoutput_idx;
+    GELOGI("First bp name %s, idx %u", bp_node->GetName().c_str(), netoutput_idx);
+  } else {
+    profiling_point.bp_index = FindLastBpFromBpNode(graph, bp_node);
   }
 
-  profiling_point.bp_index = FindLastBpFromBpNode(graph, bp_node);
   return SUCCESS;
 }
 
diff --git a/ge/graph/load/model_manager/davinci_model.cc b/ge/graph/load/model_manager/davinci_model.cc
index f8b61216..bdba150b 100755
--- a/ge/graph/load/model_manager/davinci_model.cc
+++ b/ge/graph/load/model_manager/davinci_model.cc
@@ -3727,6 +3727,8 @@ Status DavinciModel::InitTbeHandle(const OpDescPtr &op_desc) {
         binary.magic = RT_DEV_BINARY_MAGIC_ELF;
       } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AIVEC") {
         binary.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC;
+      } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AICUBE") {
+        binary.magic = RT_DEV_BINARY_MAGIC_ELF_AICUBE;
       } else {
         REPORT_INNER_ERROR("E19999", "Attr:%s value:%s in op:%s(%s), model_id:%u, check invalid",
                            TVM_ATTR_NAME_MAGIC.c_str(), json_string.c_str(),
@@ -4007,13 +4009,11 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa
     iterator_count_++;
   }
 
-  if (!is_async_mode_) {
-    GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_START));
-    ret = CopyOutputData(input_data.index, output_data, RT_MEMCPY_DEVICE_TO_DEVICE);
-    GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_INTERNAL_ERROR,
-                                   "[Copy][OutputData] to user failed, ret:%d, model_id:%u.", ret, model_id_);
-    GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_END));
-  }
+  GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_START));
+  ret = CopyOutputData(input_data.index, output_data, RT_MEMCPY_DEVICE_TO_DEVICE);
+  GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_INTERNAL_ERROR,
+                                 "[Copy][OutputData] to user failed, ret:%d, model_id:%u.", ret, model_id_);
+  GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_END));
 
   // report model time data
   GE_IF_BOOL_EXEC(profiling_model_execute_on, (void)SinkTimeProfile(input_data));
diff --git a/ge/graph/passes/memcpy_addr_async_pass.cc b/ge/graph/passes/memcpy_addr_async_pass.cc
index e8e4ebd8..84ef226a 100755
--- a/ge/graph/passes/memcpy_addr_async_pass.cc
+++ b/ge/graph/passes/memcpy_addr_async_pass.cc
@@ -47,6 +47,11 @@ Status MemcpyAddrAsyncPass::Run(ComputeGraphPtr graph) {
     return RT_FAILED;
   }
 
+  if (value == RT_CAPABILITY_NOT_SUPPORT) {
+    GELOGW("Not support zero copy, skip it.");
+    return SUCCESS;
+  }
+
   for (auto &node : graph->GetAllNodes()) {
     auto op_desc = node->GetOpDesc();
     GE_IF_BOOL_EXEC(op_desc == nullptr, continue);
diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc
index 32758f61..8eecbc80 100755
--- a/ge/hybrid/executor/worker/execution_engine.cc
+++ b/ge/hybrid/executor/worker/execution_engine.cc
@@ -428,7 +428,7 @@ Status ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const
     }
 
     int64_t expected_size;
-    GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, expected_size));
+    (void)TensorUtils::GetSize(*tensor_desc, expected_size);
     GELOGD("[%s] Input[%d] expects [%ld] bytes.", task_context.GetNodeName(), i, expected_size);
     auto size_diff = expected_size - static_cast<int64_t>(input_tensor->GetSize());
     if (size_diff > 0) {
diff --git a/tests/ut/ge/graph/build/task_generator_unittest.cc b/tests/ut/ge/graph/build/task_generator_unittest.cc
index 7e996cf1..aa697982 100644
--- a/tests/ut/ge/graph/build/task_generator_unittest.cc
+++ b/tests/ut/ge/graph/build/task_generator_unittest.cc
@@ -86,3 +86,12 @@ TEST_F(UtestTaskGeneratorTest, FindLastBpFromBpNode) {
   // netoutput has no data input, return default value 0
   EXPECT_EQ(task_generator.FindLastBpFromBpNode(graph, net_output), 0);
 }
+
+// Runs AutoFindBpOpIndex on the graph from BuildGraphBpProfiling and expects it to return SUCCESS.
+TEST_F(UtestTaskGeneratorTest, AutoFindBpOpIndex) {
+  auto graph = BuildGraphBpProfiling();
+  TaskGenerator task_generator(nullptr, 0);
+  ProfilingPoint profiling_point;
+  vector<uint32_t> all_reduce_nodes;
+  EXPECT_EQ(task_generator.AutoFindBpOpIndex(graph, profiling_point, all_reduce_nodes), SUCCESS);
+}