|
|
@@ -564,6 +564,17 @@ AiCpuBaseTask::~AiCpuBaseTask() { |
|
|
|
if (ext_info_addr_dev_ != nullptr) { |
|
|
|
(void)rtFree(ext_info_addr_dev_); |
|
|
|
} |
|
|
|
|
|
|
|
FreeHbm(copy_input_release_flag_dev_); |
|
|
|
FreeHbm(copy_input_data_size_dev_); |
|
|
|
FreeHbm(copy_input_src_dev_); |
|
|
|
FreeHbm(copy_input_dst_dev_); |
|
|
|
for (auto summary : output_summary_) { |
|
|
|
FreeHbm(summary); |
|
|
|
} |
|
|
|
for (auto out_shape : out_shape_hbm_) { |
|
|
|
FreeHbm(out_shape); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
Status AiCpuBaseTask::SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id) { |
|
|
@@ -768,17 +779,7 @@ AiCpuTask::~AiCpuTask() { |
|
|
|
FreeHbm(workspace_addr_); |
|
|
|
FreeHbm(copy_workspace_buf_); |
|
|
|
FreeHbm(copy_ioaddr_dev_); |
|
|
|
FreeHbm(copy_input_release_flag_dev_); |
|
|
|
FreeHbm(copy_input_data_size_dev_); |
|
|
|
FreeHbm(copy_input_src_dev_); |
|
|
|
FreeHbm(copy_input_dst_dev_); |
|
|
|
FreeHbm(copy_task_args_buf_); |
|
|
|
for (auto summary : output_summary_) { |
|
|
|
FreeHbm(summary); |
|
|
|
} |
|
|
|
for (auto out_shape : out_shape_hbm_) { |
|
|
|
FreeHbm(out_shape); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
Status AiCpuTask::LaunchKernel(rtStream_t stream) { |
|
|
@@ -808,7 +809,7 @@ Status AiCpuTask::LaunchKernel(rtStream_t stream) { |
|
|
|
return SUCCESS; |
|
|
|
} |
|
|
|
|
|
|
|
Status AiCpuTask::PrepareCopyInputs(vector<DataBuffer> &outputs) { |
|
|
|
Status AiCpuBaseTask::PrepareCopyInputs(vector<DataBuffer> &outputs) { |
|
|
|
std::vector<uint64_t> copy_input_release_flag; |
|
|
|
std::vector<uint64_t> copy_input_data_size; |
|
|
|
std::vector<uint64_t> copy_input_src; |
|
|
@@ -849,7 +850,7 @@ Status AiCpuTask::PrepareCopyInputs(vector<DataBuffer> &outputs) { |
|
|
|
return SUCCESS; |
|
|
|
} |
|
|
|
|
|
|
|
Status AiCpuTask::ReadResultSummaryAndPrepareMemory() { |
|
|
|
Status AiCpuBaseTask::ReadResultSummaryAndPrepareMemory() { |
|
|
|
for (size_t i = 0; i < num_outputs_; ++i) { |
|
|
|
auto &result_summary = output_summary_host_[i]; |
|
|
|
|
|
|
@@ -876,7 +877,20 @@ Status AiCpuTask::CopyDataToHbm(vector<DataBuffer> &outputs, |
|
|
|
return SUCCESS; |
|
|
|
} |
|
|
|
|
|
|
|
Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) { |
|
|
|
// Runs PrepareCopyInputs() on `outputs`, launches the CPU kernel identified by
// memcpy_so_name_ / memcpy_kernel_name_ with memcpy_args_ on `stream`, and then
// waits for the stream through rtStreamSynchronize().
//
// @param outputs  buffers forwarded to PrepareCopyInputs().
// @param stream   runtime stream used for both the launch and the wait.
// @return SUCCESS when every step passes its check. The GE_CHK_* macros are
//         expected to return early with a failure status otherwise
//         (NOTE(review): macro definitions are not visible here - confirm).
Status AiCpuCCTask::CopyDataToHbm(vector<DataBuffer> &outputs,
                                  rtStream_t stream) {
  GE_CHK_STATUS_RET_NOLOG(PrepareCopyInputs(outputs));

  // Name the launch arguments up front so the runtime call stays readable.
  const void *const so_name = static_cast<const void *>(memcpy_so_name_.data());
  const void *const kernel_name = static_cast<const void *>(memcpy_kernel_name_.data());
  const auto args_size = static_cast<uint32_t>(memcpy_args_size_);

  auto ret = rtCpuKernelLaunchWithFlag(so_name, kernel_name, block_dim_, memcpy_args_.get(), args_size,
                                       nullptr, stream, dump_flag_);
  GE_CHK_RT_RET(ret);

  // Block until the launched kernel has finished on this stream.
  GE_CHK_RT_RET(rtStreamSynchronize(stream));
  return SUCCESS;
}
|
|
|
|
|
|
|
Status AiCpuBaseTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) { |
|
|
|
for (size_t i = 0; i < num_outputs_; ++i) { |
|
|
|
const auto &result_summary = output_summary_host_[i]; |
|
|
|
std::vector<int64_t> shape_dims; |
|
|
@@ -905,7 +919,7 @@ Status AiCpuTask::UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc) { |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
Status AiCpuTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, |
|
|
|
Status AiCpuBaseTask::UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc, |
|
|
|
vector<DataBuffer> &outputs, |
|
|
|
rtStream_t stream) { |
|
|
|
if (num_outputs_ == 0) { |
|
|
@@ -1095,8 +1109,99 @@ Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, |
|
|
|
if (unknown_type_ == DEPEND_SHAPE_RANGE) { |
|
|
|
GE_CHK_RT_RET(rtStreamSynchronize(stream)); |
|
|
|
GE_CHK_STATUS_RET_NOLOG(UpdateOutputShape(output_desc)); |
|
|
|
} else if (unknown_type_ == DEPEND_COMPUTE) { |
|
|
|
GE_CHK_RT_RET(rtStreamSynchronize(stream)); |
|
|
|
GE_CHK_STATUS_RET_NOLOG(UpdateShapeAndDataByResultSummary(output_desc, output_buffers, stream)); |
|
|
|
} |
|
|
|
|
|
|
|
return SUCCESS; |
|
|
|
} |
|
|
|
|
|
|
|
// Allocates the device buffers used for result summaries and for the inputs of
// the follow-up copy task.
//
// Does nothing (returns SUCCESS) unless unknown_type_ is DEPEND_COMPUTE and the
// task has at least one output. Otherwise it:
//   1. allocates one ResultSummary-sized HBM buffer per output,
//   2. allocates the four copy-input arrays (release flag, data size, src, dst),
//   3. allocates copy_ioaddr_dev_ and uploads the four array addresses into it.
//
// @return SUCCESS on success. The GE_CHK_RT_RET macro is expected to return a
//         failure status as soon as a runtime call fails (NOTE(review): macro
//         definition is not visible here - confirm).
Status AiCpuCCTask::InitForSummaryAndCopy() {
  if (unknown_type_ != DEPEND_COMPUTE || num_outputs_ == 0) {
    GELOGI("Unknown_type is %d, output num is %zu.", unknown_type_, num_outputs_);
    return SUCCESS;
  }

  // One device-side ResultSummary per output, mirrored by a host-side vector.
  output_summary_.resize(num_outputs_);
  constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
  for (size_t i = 0; i < num_outputs_; ++i) {
    GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
  }
  output_summary_host_.resize(num_outputs_);

  // Each copy-input array holds kCopyNum uint64_t entries per output.
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);

  GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));

  // Collect the four device addresses in the order they are uploaded below.
  std::vector<uint64_t> copy_io_addr;
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_release_flag_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_data_size_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_src_dev_));
  copy_io_addr.emplace_back(reinterpret_cast<uintptr_t>(copy_input_dst_dev_));

  const auto copy_io_addr_size = sizeof(uint64_t) * copy_io_addr.size();

  GE_CHK_RT_RET(rtMalloc(&copy_ioaddr_dev_, copy_io_addr_size, RT_MEMORY_HBM));

  GE_CHK_RT_RET(rtMemcpy(copy_ioaddr_dev_, copy_io_addr_size,
                         copy_io_addr.data(), copy_io_addr_size, RT_MEMCPY_HOST_TO_DEVICE));
  return SUCCESS;
}
|
|
|
|
|
|
|
// Caches the memcpy kernel description from `kernel_def` and builds the
// host-side argument buffer (memcpy_args_) used to launch it.
//
// Steps:
//   1. record args size, so name and kernel name from `kernel_def`;
//   2. validate that args().size() matches args_size() and that the buffer is
//      at least as large as aicpu::AicpuParamHead;
//   3. copy the serialized args into a freshly allocated, zero-initialized buffer;
//   4. overwrite the io-address area that follows the param head with
//      copy_ioaddr_dev_.
//
// @param kernel_def  kernel definition of the memcpy task.
// @return SUCCESS on success; FAILED on size mismatch or allocation failure;
//         INTERNAL_ERROR when either memcpy_s call fails.
Status AiCpuCCTask::SetMemCopyTask(const domi::KernelDef &kernel_def) {
  auto &memcpy_args = kernel_def.args();
  memcpy_args_size_ = kernel_def.args_size();
  memcpy_so_name_ = kernel_def.so_name();
  memcpy_kernel_name_ = kernel_def.kernel_name();

  if (memcpy_args.size() != memcpy_args_size_) {
    REPORT_INNER_ERROR("E19999", "MemCopy task def args.size=%zu, but args_size=%u not equal.",
                       memcpy_args.size(), memcpy_args_size_);
    GELOGE(FAILED, "[Check][Size]MemCopy task def args.size=%zu, but args_size=%u not equal.",
           memcpy_args.size(), memcpy_args_size_);
    return FAILED;
  }
  if (memcpy_args_size_ < sizeof(aicpu::AicpuParamHead)) {
    REPORT_INNER_ERROR("E19999",
                       "Task def args_size=%u is less than aicpu param head len=%zu.",
                       memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
    GELOGE(FAILED,
           "[Check][Size] Task def args_size=%u is less than aicpu param head len=%zu.",
           memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
    return FAILED;
  }

  // Zero-initialized private copy of the serialized kernel arguments.
  memcpy_args_.reset(new(std::nothrow) uint8_t[memcpy_args_size_]());
  if (memcpy_args_ == nullptr) {
    REPORT_INNER_ERROR("E19999", "new memory failed for Node[MemCopy], task_size[%u].",
                       memcpy_args_size_);
    GELOGE(FAILED, "[Malloc][Memory] failed for Node[MemCopy], task_size[%u].",
           memcpy_args_size_);
    return FAILED;
  }

  errno_t sec_ret = memcpy_s(memcpy_args_.get(), memcpy_args_size_, memcpy_args.c_str(), memcpy_args.size());
  if (sec_ret != EOK) {
    REPORT_INNER_ERROR("E19999",
                       "memcpy_s argc_ failed for Node[MemCopy], ret: %d", sec_ret);
    GELOGE(INTERNAL_ERROR,
           "[Update][args] failed for Node[MemCopy], ret: %d", sec_ret);
    // Return a proper Status code instead of leaking the raw errno_t value.
    return INTERNAL_ERROR;
  }

  auto memcpy_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(memcpy_args_.get());
  uint32_t memcpy_io_num = memcpy_param_head->ioAddrNum;
  auto memcpy_io_addr = memcpy_args_.get() + sizeof(aicpu::AicpuParamHead);
  // if has input and output, need copy to ioaddr
  // NOTE(review): the source is the address of the single member
  // copy_ioaddr_dev_, while the length is sizeof(uint64_t) * ioAddrNum; if
  // ioAddrNum can exceed 1 this reads past that member - confirm against the
  // memcpy kernel's expected argument layout.
  int cpy_ret = memcpy_s(memcpy_io_addr, memcpy_args_size_ - sizeof(aicpu::AicpuParamHead),
                         &copy_ioaddr_dev_, sizeof(uint64_t) * memcpy_io_num);
  if (cpy_ret != 0) {
    REPORT_INNER_ERROR("E19999", "Node[MemCopy] memcpy io addr to AicpuParamHead failed,"
                       "ret=%d, args_size=%u, io nums=%u.",
                       cpy_ret, memcpy_args_size_, memcpy_io_num);
    GELOGE(INTERNAL_ERROR, "[Update][io_addr]Node[MemCopy] memcpy io addr to AicpuParamHead failed,"
           "ret=%d, args_size=%u, io nums=%u.",
           cpy_ret, memcpy_args_size_, memcpy_io_num);
    return INTERNAL_ERROR;
  }
  GELOGD("Set memcpy task for node[MemCopy] successfully.");
  return SUCCESS;
}
|
|
|
|
|
|
|