diff --git a/ge/graph/build/task_generator.cc b/ge/graph/build/task_generator.cc index dabdc5d2..c9dcf590 100755 --- a/ge/graph/build/task_generator.cc +++ b/ge/graph/build/task_generator.cc @@ -743,6 +743,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP GELOGI("Start AutoFindBpOpIndex"); NodePtr bp_node = nullptr; uint32_t current_idx = 0; + uint32_t netoutput_idx = 0; for (auto &node : graph->GetNodes(graph->GetGraphUnknownFlag())) { OpDescPtr op_desc = node->GetOpDesc(); GE_CHECK_NOTNULL(op_desc); @@ -760,6 +761,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP if (op_desc->GetName() == NODE_NAME_NET_OUTPUT) { if (bp_node == nullptr) { bp_node = node; + netoutput_idx = current_idx - 1; } } if (graph->GetNeedIteration()) { @@ -784,9 +786,13 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP if (bp_node == nullptr) { GELOGW("not find bp_node."); return SUCCESS; + } else if (bp_node->GetName() == NODE_NAME_NET_OUTPUT) { + profiling_point.bp_index = netoutput_idx; + GELOGI("First bp name %s, idx %u", bp_node->GetName().c_str(), netoutput_idx); + } else { + profiling_point.bp_index = FindLastBpFromBpNode(graph, bp_node); } - profiling_point.bp_index = FindLastBpFromBpNode(graph, bp_node); return SUCCESS; } diff --git a/ge/graph/load/model_manager/davinci_model.cc b/ge/graph/load/model_manager/davinci_model.cc index f8b61216..bdba150b 100755 --- a/ge/graph/load/model_manager/davinci_model.cc +++ b/ge/graph/load/model_manager/davinci_model.cc @@ -3727,6 +3727,8 @@ Status DavinciModel::InitTbeHandle(const OpDescPtr &op_desc) { binary.magic = RT_DEV_BINARY_MAGIC_ELF; } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AIVEC") { binary.magic = RT_DEV_BINARY_MAGIC_ELF_AIVEC; + } else if (json_string == "RT_DEV_BINARY_MAGIC_ELF_AICUBE") { + binary.magic = RT_DEV_BINARY_MAGIC_ELF_AICUBE; } else { REPORT_INNER_ERROR("E19999", "Attr:%s value:%s in op:%s(%s), model_id:%u, 
check invalid", TVM_ATTR_NAME_MAGIC.c_str(), json_string.c_str(), @@ -4007,13 +4009,11 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa iterator_count_++; } - if (!is_async_mode_) { - GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_START)); - ret = CopyOutputData(input_data.index, output_data, RT_MEMCPY_DEVICE_TO_DEVICE); - GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_INTERNAL_ERROR, - "[Copy][OutputData] to user failed, ret:%d, model_id:%u.", ret, model_id_); - GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_END)); - } + GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_START)); + ret = CopyOutputData(input_data.index, output_data, RT_MEMCPY_DEVICE_TO_DEVICE); + GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(ret != SUCCESS, return ACL_ERROR_GE_INTERNAL_ERROR, + "[Copy][OutputData] to user failed, ret:%d, model_id:%u.", ret, model_id_); + GE_IF_BOOL_EXEC(profiling_model_execute_on, SetProfileTime(MODEL_AFTER_PROC_END)); // report model time data GE_IF_BOOL_EXEC(profiling_model_execute_on, (void)SinkTimeProfile(input_data)); diff --git a/ge/graph/passes/memcpy_addr_async_pass.cc b/ge/graph/passes/memcpy_addr_async_pass.cc index e8e4ebd8..84ef226a 100755 --- a/ge/graph/passes/memcpy_addr_async_pass.cc +++ b/ge/graph/passes/memcpy_addr_async_pass.cc @@ -47,6 +47,11 @@ Status MemcpyAddrAsyncPass::Run(ComputeGraphPtr graph) { return RT_FAILED; } + if (value == RT_CAPABILITY_NOT_SUPPORT) { + GELOGW("Not support zero copy, skip it."); + return SUCCESS; + } + for (auto &node : graph->GetAllNodes()) { auto op_desc = node->GetOpDesc(); GE_IF_BOOL_EXEC(op_desc == nullptr, continue); diff --git a/ge/hybrid/executor/worker/execution_engine.cc b/ge/hybrid/executor/worker/execution_engine.cc index 32758f61..8eecbc80 100755 --- a/ge/hybrid/executor/worker/execution_engine.cc +++ b/ge/hybrid/executor/worker/execution_engine.cc @@ -428,7 +428,7 @@ Status 
ExecutionEngine::ValidateInputTensors(const NodeState &node_state, const } int64_t expected_size; - GE_CHK_GRAPH_STATUS_RET(TensorUtils::GetTensorMemorySizeInBytes(*tensor_desc, expected_size)); + (void)TensorUtils::GetSize(*tensor_desc, expected_size); GELOGD("[%s] Input[%d] expects [%ld] bytes.", task_context.GetNodeName(), i, expected_size); auto size_diff = expected_size - static_cast<int64_t>(input_tensor->GetSize()); if (size_diff > 0) { diff --git a/tests/ut/ge/graph/build/task_generator_unittest.cc b/tests/ut/ge/graph/build/task_generator_unittest.cc index 7e996cf1..aa697982 100644 --- a/tests/ut/ge/graph/build/task_generator_unittest.cc +++ b/tests/ut/ge/graph/build/task_generator_unittest.cc @@ -86,3 +86,12 @@ TEST_F(UtestTaskGeneratorTest, FindLastBpFromBpNode) { // netoutput has no data input, return default value 0 EXPECT_EQ(task_generator.FindLastBpFromBpNode(graph, net_output), 0); } + +TEST_F(UtestTaskGeneratorTest, AutoFindBpOpIndex) { + auto graph = BuildGraphBpProfiling(); + TaskGenerator task_generator(nullptr, 0); + auto net_output = graph->FindNode("netoutput"); + ProfilingPoint profiling_point; + vector<uint32_t> all_reduce_nodes; + EXPECT_EQ(task_generator.AutoFindBpOpIndex(graph, profiling_point, all_reduce_nodes), SUCCESS); +}