diff --git a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
index 6ed6866c..1f95d2ce 100755
--- a/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
+++ b/ge/hybrid/node_executor/aicore/aicore_node_executor.cc
@@ -29,7 +29,10 @@ bool IsNoOp(const NodeItem &node_item) {
     const auto &tensor_desc = node_item.MutableOutputDesc(i);
     GE_CHECK_NOTNULL(tensor_desc);
     const auto &shape = tensor_desc->MutableShape();
-    if (shape.IsScalar() || shape.GetShapeSize() > 0) {
+    // A DEPEND_SHAPE_RANGE node must still be launched even when its current
+    // output shape is empty: its outputs shape is updated after execution.
+    if (shape.IsScalar() || shape.GetShapeSize() > 0 ||
+        (node_item.shape_inference_type == DEPEND_SHAPE_RANGE)) {
       return false;
     }
   }
@@ -219,11 +222,30 @@ Status AiCoreNodeTask::ExecuteAsync(TaskContext &context, std::function<void()>
     RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeLaunchKernel] End");
   }
 
-  if (done_callback != nullptr) {
-    RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeRegisterCallback] Start");
-    GE_CHK_STATUS_RET_NOLOG(context.RegisterCallback(done_callback));
-    RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeRegisterCallback] End");
-  }
+  // Always register a completion callback. For DEPEND_SHAPE_RANGE ops it
+  // refreshes the outputs shape from the last task before handing control
+  // to done_callback (which may be null).
+  auto callback = [=, &context]() {
+    Status callback_ret = SUCCESS;
+    if (!tasks_.empty()) {
+      // Only the last task is queried for the outputs shape.
+      auto task = tasks_.back().get();
+      if (task->GetUnknownShapeOpType() == DEPEND_SHAPE_RANGE) {
+        GELOGD("Node[%s] need update outputs shape.", context.GetNodeName());
+        callback_ret = task->UpdateOutputsShape(context);
+      }
+    }
+    if (done_callback != nullptr) {
+      context.SetStatus(callback_ret);
+      done_callback();
+    }
+  };
+
+  RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
+                         "[AiCoreNodeRegisterCallback] Start");
+  GE_CHK_STATUS_RET_NOLOG(context.RegisterCallback(callback));
+  RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(),
+                         "[AiCoreNodeRegisterCallback] End");
 
   GELOGD("[%s] ExecuteAsync End.", context.GetNodeName());
   RECORD_EXECUTION_EVENT(context.GetExecutionContext(), context.GetNodeName(), "[AiCoreNodeTaskExecuteAsync] End");
diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.cc b/ge/hybrid/node_executor/aicore/aicore_op_task.cc
index fe9bba9a..ec98b726 100644
--- a/ge/hybrid/node_executor/aicore/aicore_op_task.cc
+++ b/ge/hybrid/node_executor/aicore/aicore_op_task.cc
@@ -15,13 +15,15 @@
  */
 
 #include "hybrid/node_executor/aicore/aicore_op_task.h"
-#include "framework/common/taskdown_common.h"
+
+#include "common/formats/formats.h"
+#include "external/graph/types.h"
 #include "framework/common/debug/log.h"
+#include "framework/common/taskdown_common.h"
 #include "graph/ge_context.h"
+#include "graph/load/model_manager/tbe_handle_store.h"
 #include "hybrid/executor/hybrid_execution_context.h"
 #include "hybrid/node_executor/aicore/aicore_task_builder.h"
-#include "graph/load/model_manager/tbe_handle_store.h"
-#include "external/graph/types.h"
 #include "single_op/task/build_task_utils.h"
 #include "single_op/task/tbe_task_builder.h"
 
@@ -51,7 +53,52 @@ bool TbeHandleRegistry::AddHandle(std::unique_ptr<TbeHandleHolder> &&holder) {
   return ret.second;
 }
 
-Status AiCoreOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def) {
+// Initializes the task from its task definition. For an op whose unknown shape
+// type is DEPEND_SHAPE_RANGE it also allocates and zero-fills shape_buffer_,
+// the buffer the outputs shape is later read back from.
+// Returns PARAM_INVALID if such an op has no outputs; other failures are
+// propagated from DoInit, the allocator and rtMemcpy.
+Status AiCoreOpTask::Init(const OpDesc &op_desc,
+                          const domi::TaskDef &task_def) {
+  GE_CHK_STATUS_RET_NOLOG(AiCoreOpTask::DoInit(op_desc, task_def));
+  // The lookup result is ignored on purpose: DEPEND_IN_SHAPE is the default.
+  int32_t unknown_shape_op_type_val = static_cast<int32_t>(DEPEND_IN_SHAPE);
+  (void)AttrUtils::GetInt(op_desc, ::ge::ATTR_NAME_UNKNOWN_SHAPE_TYPE,
+                          unknown_shape_op_type_val);
+  unknown_shape_op_type_ =
+      static_cast<UnknowShapeOpType>(unknown_shape_op_type_val);
+  GELOGD("Op [%s] unknown shape type is %d", op_desc.GetName().c_str(),
+         unknown_shape_op_type_);
+  if (unknown_shape_op_type_ != DEPEND_SHAPE_RANGE) {
+    return SUCCESS;
+  }
+
+  // size,dim1,...,dim8: 9*4=36
+  const size_t kDefaultShapeSize = 36;
+  const size_t size = kDefaultShapeSize * op_desc.GetOutputsSize();
+  if (size == 0) {
+    GELOGE(PARAM_INVALID,
+           "Op [%s] unknown shape type is %d, but outputs size is 0.",
+           op_desc.GetName().c_str(), unknown_shape_op_type_);
+    return PARAM_INVALID;
+  }
+  auto allocator = NpuMemoryAllocator::GetAllocator();
+  GE_CHECK_NOTNULL(allocator);
+  shape_buffer_ = TensorBuffer::Create(allocator, size);
+  GE_CHECK_NOTNULL(shape_buffer_);
+  GELOGD("Op [%s] allocate memory for outputs shape success, size=%zu",
+         op_desc.GetName().c_str(), size);
+  // Zero the buffer: a leading 0 marks an output whose shape was not written.
+  const std::vector<char> default_value(size, 0);
+  GE_CHK_RT_RET(rtMemcpy(shape_buffer_->GetData(), shape_buffer_->GetSize(),
+                         default_value.data(), size,
+                         RT_MEMCPY_HOST_TO_DEVICE));
+  return SUCCESS;
+}
+
+// Performs the initialization shared with AtomicAddrCleanOpTask::Init.
+Status AiCoreOpTask::DoInit(const OpDesc &op_desc,
+                            const domi::TaskDef &task_def) {
   op_type_ = op_desc.GetType();
   log_name_ = op_desc.GetName() + "_tvmbin";
   log_id_ = log_id++;
@@ -81,6 +128,99 @@ Status AiCoreOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def)
   return SUCCESS;
 }
 
+// Copies the outputs shape records from shape_buffer_ back to the host and
+// applies every record whose dim count is non-zero to the matching output.
+// Each record is 9 uint32_t values: dim count followed by up to 8 dims.
+Status AiCoreOpTask::UpdateOutputsShape(TaskContext &context) const {
+  GELOGD("Node[%s] start update outputs shape.", context.GetNodeName());
+  GE_CHECK_NOTNULL(shape_buffer_);
+  const uint32_t kMaxDimNum = 8;
+  // One record per output: {dim_num, dim1, ..., dim8}.
+  using ShapeRecord = uint32_t[kMaxDimNum + 1];
+  auto outputs_shape_buffer =
+      std::unique_ptr<uint8_t[]>(new uint8_t[shape_buffer_->GetSize()]);
+  GE_CHK_RT_RET(rtMemcpy(outputs_shape_buffer.get(), shape_buffer_->GetSize(),
+                         shape_buffer_->GetData(), shape_buffer_->GetSize(),
+                         RT_MEMCPY_DEVICE_TO_HOST));
+  int num_outputs = context.NumOutputs();
+  auto outputs_shape =
+      reinterpret_cast<const ShapeRecord *>(outputs_shape_buffer.get());
+  for (int i = 0; i < num_outputs; ++i) {
+    if (outputs_shape[i][0] != 0) {
+      uint32_t dim_num = outputs_shape[i][0];
+      GE_CHECK_LE(dim_num, kMaxDimNum);
+      std::vector<int64_t> dims;
+      // Slot 0 holds the dim count, so the dims occupy slots 1..dim_num.
+      for (uint32_t j = 1; j <= dim_num; ++j) {
+        dims.emplace_back(static_cast<int64_t>(outputs_shape[i][j]));
+      }
+      auto shape_new = GeShape(dims);
+      GELOGD("Node[%s] output[%d] shape:%s.", context.GetNodeName(), i,
+             ToString(dims).c_str());
+      GE_CHK_STATUS_RET_NOLOG(UpdateShapeToOutputDesc(context, shape_new, i));
+    }
+  }
+  return SUCCESS;
+}
+
+// Writes a new shape into the output_index-th output of the node. When the
+// output's format differs from its origin format, the origin shape is derived
+// with formats::TransShape; otherwise the same shape is used for both.
+Status AiCoreOpTask::UpdateShapeToOutputDesc(TaskContext &context,
+                                             const GeShape &shape,
+                                             const int output_index) const {
+  auto output_desc = context.MutableOutputDesc(output_index);
+  GE_CHECK_NOTNULL(output_desc);
+  auto shape_old = output_desc->GetShape();
+  auto origin_shape_old = output_desc->GetOriginShape();
+  GELOGD(
+      "Node[%s] try to update output[%d] shape from %s to %s, origin_shape "
+      "from %s to %s.",
+      context.GetNodeName(), output_index, shape_old.ToString().c_str(),
+      shape.ToString().c_str(), origin_shape_old.ToString().c_str(),
+      shape.ToString().c_str());
+  auto origin_format = output_desc->GetOriginFormat();
+  auto format = output_desc->GetFormat();
+  auto node_state = context.GetNodeState();
+  GE_CHECK_NOTNULL(node_state);
+  if (origin_format == format) {
+    GE_CHK_STATUS_RET(
+        node_state->UpdateOutputShapes(output_index, shape, shape),
+        "Node[%s] try to update output[%d] shape from %s to %s, origin_shape "
+        "from %s to %s failed.",
+        context.GetNodeName(), output_index, shape_old.ToString().c_str(),
+        shape.ToString().c_str(), origin_shape_old.ToString().c_str(),
+        shape.ToString().c_str());
+    return SUCCESS;
+  }
+  // Formats differ: convert the shape into the origin format.
+  std::vector<int64_t> origin_dims_new;
+  auto trans_ret =
+      formats::TransShape(format, shape.GetDims(), output_desc->GetDataType(),
+                          origin_format, origin_dims_new);
+  GE_CHK_STATUS_RET(
+      trans_ret,
+      "[Trans][Shape] failed for node[%s] output[%d], origin_format[%d] "
+      "is not same as format[%d], shape=%s.",
+      context.GetNodeName(), output_index, origin_format, format,
+      shape.ToString().c_str());
+  auto origin_shape_new = GeShape(origin_dims_new);
+  GE_CHK_STATUS_RET(
+      node_state->UpdateOutputShapes(output_index, shape, origin_shape_new),
+      "Node[%s] try to update output[%d] shape from %s to %s, origin_shape "
+      "from %s to %s failed.",
+      context.GetNodeName(), output_index, shape_old.ToString().c_str(),
+      shape.ToString().c_str(), origin_shape_old.ToString().c_str(),
+      origin_shape_new.ToString().c_str());
+  GELOGD(
+      "Node[%s] update output[%d] shape from %s to %s, origin_shape "
+      "from %s to %s.",
+      context.GetNodeName(), output_index, shape_old.ToString().c_str(),
+      shape.ToString().c_str(), origin_shape_old.ToString().c_str(),
+      origin_shape_new.ToString().c_str());
+  return SUCCESS;
+}
+
 Status AiCoreOpTask::RegisterTbeHandle(const OpDesc &op_desc) {
   rtError_t rt_ret = rtQueryFunctionRegistered(stub_name_.c_str());
   if (rt_ret != RT_ERROR_NONE) {
@@ -429,6 +569,12 @@ Status AiCoreOpTask::UpdateArgs(TaskContext &task_context) {
   if (tiling_buffer_ != nullptr) {
     ++expected_arg_count;
   }
+
+  // The outputs shape buffer is passed to the kernel as one more argument.
+  if (shape_buffer_ != nullptr) {
+    ++expected_arg_count;
+  }
+
   if (expected_arg_count > max_arg_count_) {
     GELOGD("Need to reset size of args_ from %u to %zu.", max_arg_count_, expected_arg_count);
     auto length = expected_arg_count * sizeof(uintptr_t) + offset_;
@@ -465,6 +611,13 @@ Status AiCoreOpTask::UpdateArgs(TaskContext &task_context) {
     arg_base_[index++] = reinterpret_cast<uintptr_t>(output->GetData());
   }
 
+  // The shape buffer address goes after the outputs and before the workspaces.
+  if (shape_buffer_ != nullptr) {
+    arg_base_[index++] = reinterpret_cast<uintptr_t>(shape_buffer_->GetData());
+    GELOGD("Node:%s add shape buffer addr to args.",
+           task_context.GetNodeName());
+  }
+
   int workspace_num = static_cast<int>(task_context.NumWorkspaces());
   for (int i = 0; i < workspace_num; ++i) {
     const auto workspace = task_context.MutableWorkspace(i);
@@ -567,7 +720,8 @@ std::string AiCoreOpTask::GetKeyForKernelName(const OpDesc &op_desc) const {
 }
 
 Status AtomicAddrCleanOpTask::Init(const OpDesc &op_desc, const domi::TaskDef &task_def) {
-  GE_CHK_STATUS_RET_NOLOG(AiCoreOpTask::Init(op_desc, task_def));
+  // Use DoInit so the DEPEND_SHAPE_RANGE handling in AiCoreOpTask::Init is skipped.
+  GE_CHK_STATUS_RET_NOLOG(AiCoreOpTask::DoInit(op_desc, task_def));
   return InitAtomicAddrCleanIndices(op_desc);
 }
 
diff --git a/ge/hybrid/node_executor/aicore/aicore_op_task.h b/ge/hybrid/node_executor/aicore/aicore_op_task.h
index 21a947f2..91c46601 100755
--- a/ge/hybrid/node_executor/aicore/aicore_op_task.h
+++ b/ge/hybrid/node_executor/aicore/aicore_op_task.h
@@ -82,6 +82,14 @@ class AiCoreOpTask {
 
   virtual const std::string& GetOpType() const;
 
+  // Unknown-shape type read from the op desc in Init (DEPEND_IN_SHAPE by default).
+  UnknowShapeOpType GetUnknownShapeOpType() const {
+    return unknown_shape_op_type_;
+  }
+
+  // Reads the outputs shape back from shape_buffer_ and updates the node's outputs.
+  Status UpdateOutputsShape(TaskContext &context) const;
+
  protected:
   Status UpdateTilingInfo(TaskContext &context);
   virtual std::string GetKeyForOpParamSize() const;
@@ -90,6 +98,8 @@ class AiCoreOpTask {
   virtual std::string GetKeyForTvmMetaData() const;
   virtual std::string GetKeyForKernelName(const OpDesc &op_desc) const;
   virtual Status CalcTilingInfo(const NodePtr &node, optiling::utils::OpRunInfo &tiling_info);
+  // Common initialization without the DEPEND_SHAPE_RANGE handling done by Init.
+  Status DoInit(const OpDesc &op_desc, const domi::TaskDef &task_def);
 
   std::unique_ptr<TensorBuffer> tiling_buffer_ = nullptr;
   std::string tiling_data_;
@@ -104,6 +114,8 @@ class AiCoreOpTask {
   Status RegisterKernelHandle(const OpDesc &op_desc);
   Status InitWithKernelDef(const OpDesc &op_desc, const domi::TaskDef &task_def);
   Status InitWithKernelDefWithHandle(const OpDesc &node, const domi::TaskDef &task_def);
+  Status UpdateShapeToOutputDesc(TaskContext &context, const GeShape &shape,
+                                 const int output_index) const;
 
   std::string stub_name_;
   void *stub_func_ = nullptr;
@@ -122,6 +134,9 @@ class AiCoreOpTask {
   std::string log_name_;
   uint32_t offset_ = 0;
   std::string op_type_;
+  UnknowShapeOpType unknown_shape_op_type_ = DEPEND_IN_SHAPE;
+  // Allocated by Init only for DEPEND_SHAPE_RANGE ops; nullptr otherwise.
+  std::unique_ptr<TensorBuffer> shape_buffer_ = nullptr;
 };
 
 class AtomicAddrCleanOpTask : public AiCoreOpTask {