diff --git a/ge/single_op/task/op_task.cc b/ge/single_op/task/op_task.cc
index 4dd09c43..b7b638de 100755
--- a/ge/single_op/task/op_task.cc
+++ b/ge/single_op/task/op_task.cc
@@ -1164,6 +1164,113 @@ Status AiCpuBaseTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
   return SUCCESS;
 }
 
+// Prepares the device buffers used by the result-summary and memcpy step of an AiCpuCC task.
+// Returns SUCCESS without allocating anything unless unknown_type_ is DEPEND_COMPUTE and
+// num_outputs_ is non-zero. Otherwise allocates one ResultSummary-sized RT_MEMORY_HBM buffer
+// per output plus four RT_MEMORY_HBM buffers of num_outputs_ * kCopyNum * sizeof(uint64_t)
+// bytes, and appends the four buffer addresses to copy_io_addr_.
+Status AiCpuCCTask::InitForSummaryAndCopy() {
+  if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
+    GELOGI("Unknown_type is %d, output num is %zu.", unknown_type_, num_outputs_);
+    return SUCCESS;
+  }
+
+  output_summary_.resize(num_outputs_);
+  constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
+  for (size_t i = 0; i < num_outputs_; ++i) {
+    GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
+  }
+  output_summary_host_.resize(num_outputs_);
+
+  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);
+
+  GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
+  GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
+  GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
+  GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));
+
+  // Order matters: SetMemCopyTask copies these entries, starting at index 0, into the io addr area.
+  copy_io_addr_.emplace_back(reinterpret_cast<uint64_t>(copy_input_release_flag_dev_));
+  copy_io_addr_.emplace_back(reinterpret_cast<uint64_t>(copy_input_data_size_dev_));
+  copy_io_addr_.emplace_back(reinterpret_cast<uint64_t>(copy_input_src_dev_));
+  copy_io_addr_.emplace_back(reinterpret_cast<uint64_t>(copy_input_dst_dev_));
+  return SUCCESS;
+}
+
+// Builds the host-side argument buffer of the MemCopy kernel from kernel_def.
+// Records the args size, so name and kernel name, copies kernel_def.args() into memcpy_args_,
+// then overwrites the io addr area that follows the AicpuParamHead with the first ioAddrNum
+// entries of copy_io_addr_.
+// Returns SUCCESS on success; FAILED when the args size is inconsistent, smaller than the param
+// head, or the buffer cannot be allocated; INTERNAL_ERROR when a copy fails or ioAddrNum exceeds
+// the number of entries in copy_io_addr_.
+Status AiCpuCCTask::SetMemCopyTask(const domi::KernelDef &kernel_def) {
+  auto &memcpy_args = kernel_def.args();
+  memcpy_args_size_ = kernel_def.args_size();
+  memcpy_so_name_ = kernel_def.so_name();
+  memcpy_kernel_name_ = kernel_def.kernel_name();
+  if (memcpy_args.size() != memcpy_args_size_) {
+    REPORT_INNER_ERROR("E19999", "MemCopy task def args.size=%zu, but args_size=%u not equal.",
+                       memcpy_args.size(), memcpy_args_size_);
+    GELOGE(FAILED, "[Check][Size]MemCopy task def args.size=%zu, but args_size=%u not equal.",
+           memcpy_args.size(), memcpy_args_size_);
+    return FAILED;
+  }
+  if (memcpy_args_size_ < sizeof(aicpu::AicpuParamHead)) {
+    REPORT_INNER_ERROR("E19999",
+                       "Task def args_size=%u is less than aicpu param head len=%zu.",
+                       memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
+    GELOGE(FAILED,
+           "[Check][Size] Task def args_size=%u is less than aicpu param head len=%zu.",
+           memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
+    return FAILED;
+  }
+
+  memcpy_args_.reset(new(std::nothrow) uint8_t[memcpy_args_size_]());
+  if (memcpy_args_ == nullptr) {
+    REPORT_INNER_ERROR("E19999", "new memory failed for Node[MemCopy], task_size[%u].",
+                       memcpy_args_size_);
+    GELOGE(FAILED, "[Malloc][Memory] failed for Node[MemCopy], task_size[%u].",
+           memcpy_args_size_);
+    return FAILED;
+  }
+
+  errno_t sec_ret = memcpy_s(memcpy_args_.get(), memcpy_args_size_, memcpy_args.c_str(), memcpy_args.size());
+  if (sec_ret != EOK) {
+    REPORT_INNER_ERROR("E19999",
+                       "memcpy_s argc_ failed for Node[MemCopy], ret: %d", sec_ret);
+    GELOGE(INTERNAL_ERROR,
+           "[Update][args] failed for Node[MemCopy], ret: %d", sec_ret);
+    // Report a Status code, not the raw errno_t value returned by memcpy_s.
+    return INTERNAL_ERROR;
+  }
+  auto memcpy_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(memcpy_args_.get());
+  uint32_t memcpy_io_num = memcpy_param_head->ioAddrNum;
+  // memcpy_io_num comes from the task def; never read past the end of copy_io_addr_.
+  if (memcpy_io_num > copy_io_addr_.size()) {
+    REPORT_INNER_ERROR("E19999", "Node[MemCopy] io nums=%u exceeds copy io addr num=%zu.",
+                       memcpy_io_num, copy_io_addr_.size());
+    GELOGE(INTERNAL_ERROR, "[Check][Size]Node[MemCopy] io nums=%u exceeds copy io addr num=%zu.",
+           memcpy_io_num, copy_io_addr_.size());
+    return INTERNAL_ERROR;
+  }
+  auto memcpy_io_addr = memcpy_args_.get() + sizeof(aicpu::AicpuParamHead);
+  // if has input and output, need copy to ioaddr (data() avoids indexing an empty vector)
+  int cpy_ret = memcpy_s(memcpy_io_addr, memcpy_args_size_ - sizeof(aicpu::AicpuParamHead),
+                         copy_io_addr_.data(), sizeof(uint64_t) * memcpy_io_num);
+  if (cpy_ret != 0) {
+    REPORT_INNER_ERROR("E19999", "Node[MemCopy] memcpy io addr to AicpuParamHead failed,"
+                       "ret=%d, args_size=%u, io nums=%u.",
+                       cpy_ret, memcpy_args_size_, memcpy_io_num);
+    GELOGE(INTERNAL_ERROR, "[Update][io_addr]Node[MemCopy] memcpy io addr to AicpuParamHead failed,"
+           "ret=%d, args_size=%u, io nums=%u.",
+           cpy_ret, memcpy_args_size_, memcpy_io_num);
+    return INTERNAL_ERROR;
+  }
+  GELOGD("Set memcpy task for node[MemCopy] successfully.");
+  return SUCCESS;
+}
+
 Status AiCpuBaseTask::UpdateArgTable(const SingleOpModelParam &param) {
   // aicpu do not have workspace, for now
   return DoUpdateArgTable(param, false);
diff --git a/ge/single_op/task/op_task.h b/ge/single_op/task/op_task.h
index 5e2566e3..1769d976 100644
--- a/ge/single_op/task/op_task.h
+++ b/ge/single_op/task/op_task.h
@@ -182,6 +182,17 @@ class AiCpuBaseTask : public OpTask {
                        rtStream_t stream);
   Status UpdateOutputShape(vector<GeTensorDesc> &output_desc);
   Status UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc);
+  Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
+                                           vector<DataBuffer> &outputs,
+                                           rtStream_t stream);
+  Status ReadResultSummaryAndPrepareMemory();
+
+  Status PrepareCopyInputs(vector<DataBuffer> &outputs);
+
+  Status UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc);
+
+  // Pure virtual: every concrete task type supplies its own copy launch (AiCpuTask overrides it below).
+  virtual Status CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream) = 0;
   // for blocking aicpu op
   Status DistributeWaitTaskForAicpuBlockingOp(rtStream_t stream);
   Status UpdateEventIdForBlockingAicpuOp();
@@ -197,6 +208,16 @@ class AiCpuBaseTask : public OpTask {
   // for blocking aicpu op
   bool is_blocking_aicpu_op_ = false;
   rtEvent_t rt_event_ = nullptr;
+  // for copy task: moved here from AiCpuTask so that AiCpuCCTask can use the same buffers.
+  std::vector<void *> output_summary_;
+  std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_;
+
+  void *copy_input_release_flag_dev_ = nullptr;
+  void *copy_input_data_size_dev_ = nullptr;
+  void *copy_input_src_dev_ = nullptr;
+  void *copy_input_dst_dev_ = nullptr;
+
+  vector<void *> out_shape_hbm_;
 };
 
 class AiCpuTask : public AiCpuBaseTask {
@@ -207,21 +228,12 @@ class AiCpuTask : public AiCpuBaseTask {
 
   Status LaunchKernel(rtStream_t stream) override;
   void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
-
   Status SetMemCopyTask(const domi::KernelExDef &kernel_def);
 
  private:
   // for copy task.
   Status InitForSummaryAndCopy();
-  Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
-                                           vector<DataBuffer> &outputs,
-                                           rtStream_t stream);
-  Status ReadResultSummaryAndPrepareMemory();
-
-  Status CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream);
-  Status PrepareCopyInputs(vector<DataBuffer> &outputs);
-
-  Status UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc);
+  Status CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream) override;
 
   friend class AiCpuTaskBuilder;
   void *workspace_addr_ = nullptr;
@@ -241,17 +253,8 @@ class AiCpuTask : public AiCpuBaseTask {
   void *copy_task_args_buf_ = nullptr;
   void *copy_workspace_buf_ = nullptr;
 
-  std::vector<void *> output_summary_;
-  std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_;
-
   void *copy_ioaddr_dev_ = nullptr;
 
-  void *copy_input_release_flag_dev_ = nullptr;
-  void *copy_input_data_size_dev_ = nullptr;
-  void *copy_input_src_dev_ = nullptr;
-  void *copy_input_dst_dev_ = nullptr;
-
-  vector<void *> out_shape_hbm_;
   uint64_t kernel_id_ = 0;
 };
 