Browse Source

fix

pull/2052/head
guopeian 3 years ago
parent
commit
bb608b22b7
2 changed files with 105 additions and 19 deletions
  1. +85
    -0
      ge/single_op/task/op_task.cc
  2. +20
    -19
      ge/single_op/task/op_task.h

+ 85
- 0
ge/single_op/task/op_task.cc View File

@@ -1164,6 +1164,91 @@ Status AiCpuBaseTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
return SUCCESS;
}

// Allocates the HBM buffers used for result summaries and for the copy-task inputs,
// then appends the four copy-input buffer addresses to copy_io_addr_.
//
// Nothing is allocated (and SUCCESS is returned) unless unknown_type_ is DEPEND_COMPUTE
// and the task has at least one output. Any rtMalloc failure is returned to the caller
// through GE_CHK_RT_RET.
Status AiCpuCCTask::InitForSummaryAndCopy() {
  const bool needs_summary_and_copy = (unknown_type_ == DEPEND_COMPUTE) && (num_outputs_ != 0);
  if (!needs_summary_and_copy) {
    GELOGI("Unknown_type is %d, output num is %zu.", unknown_type_, num_outputs_);
    return SUCCESS;
  }

  constexpr auto result_summary_size = sizeof(aicpu::FWKAdapter::ResultSummary);
  // Each of the four copy-input buffers holds kCopyNum uint64_t entries per output.
  const size_t copy_input_buf_len = num_outputs_ * kCopyNum * sizeof(uint64_t);

  // One ResultSummary-sized HBM slot per output.
  // NOTE(review): the rtMalloc expressions are kept textually as they were in case
  // GE_CHK_RT_RET embeds the expression text in its error report - confirm against the macro.
  output_summary_.resize(num_outputs_);
  size_t i = 0;
  while (i < num_outputs_) {
    GE_CHK_RT_RET(rtMalloc(&output_summary_[i], result_summary_size, RT_MEMORY_HBM));
    ++i;
  }
  // Host-side vector is sized only after every summary slot has been allocated.
  output_summary_host_.resize(num_outputs_);

  GE_CHK_RT_RET(rtMalloc(&copy_input_release_flag_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_data_size_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_src_dev_, copy_input_buf_len, RT_MEMORY_HBM));
  GE_CHK_RT_RET(rtMalloc(&copy_input_dst_dev_, copy_input_buf_len, RT_MEMORY_HBM));

  // Publish the addresses in a fixed order: release flag, data size, source, destination.
  void *const copy_input_bufs[] = {copy_input_release_flag_dev_, copy_input_data_size_dev_,
                                   copy_input_src_dev_, copy_input_dst_dev_};
  for (void *const buf : copy_input_bufs) {
    copy_io_addr_.emplace_back(reinterpret_cast<uintptr_t>(buf));
  }
  return SUCCESS;
}

// Prepares the host-side argument buffer for the MemCopy kernel.
//
// Records the so name, kernel name and args size from `kernel_def`, copies the serialized
// args into a freshly allocated memcpy_args_ buffer, and then overwrites the io-address
// table that follows the AicpuParamHead with the first `ioAddrNum` entries of copy_io_addr_.
//
// @param kernel_def  Kernel definition of the MemCopy task.
// @return SUCCESS on success;
//         FAILED if args.size() and args_size disagree, the args are shorter than an
//         AicpuParamHead, or the buffer allocation fails;
//         INTERNAL_ERROR if a memcpy_s call fails or the head requests more io addresses
//         than copy_io_addr_ holds.
Status AiCpuCCTask::SetMemCopyTask(const domi::KernelDef &kernel_def) {
  const auto &memcpy_args = kernel_def.args();
  memcpy_args_size_ = kernel_def.args_size();
  memcpy_so_name_ = kernel_def.so_name();
  memcpy_kernel_name_ = kernel_def.kernel_name();
  if (memcpy_args.size() != memcpy_args_size_) {
    REPORT_INNER_ERROR("E19999", "MemCopy task def args.size=%zu, but args_size=%u not equal.",
                       memcpy_args.size(), memcpy_args_size_);
    GELOGE(FAILED, "[Check][Size]MemCopy task def args.size=%zu, but args_size=%u not equal.",
           memcpy_args.size(), memcpy_args_size_);
    return FAILED;
  }
  // The buffer is reinterpreted as an AicpuParamHead below, so it must be at least that large.
  if (memcpy_args_size_ < sizeof(aicpu::AicpuParamHead)) {
    REPORT_INNER_ERROR("E19999",
                       "Task def args_size=%u is less than aicpu param head len=%zu.",
                       memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
    GELOGE(FAILED,
           "[Check][Size] Task def args_size=%u is less than aicpu param head len=%zu.",
           memcpy_args_size_, sizeof(aicpu::AicpuParamHead));
    return FAILED;
  }

  memcpy_args_.reset(new(std::nothrow) uint8_t[memcpy_args_size_]());
  if (memcpy_args_ == nullptr) {
    REPORT_INNER_ERROR("E19999", "new memory failed for Node[MemCopy], task_size[%u].",
                       memcpy_args_size_);
    GELOGE(FAILED, "[Malloc][Memory] failed for Node[MemCopy], task_size[%u].",
           memcpy_args_size_);
    return FAILED;
  }

  const errno_t sec_ret = memcpy_s(memcpy_args_.get(), memcpy_args_size_, memcpy_args.c_str(), memcpy_args.size());
  if (sec_ret != EOK) {
    REPORT_INNER_ERROR("E19999",
                       "memcpy_s argc_ failed for Node[MemCopy], ret: %d", sec_ret);
    GELOGE(INTERNAL_ERROR,
           "[Update][args] failed for Node[MemCopy], ret: %d", sec_ret);
    // Return a GE status rather than the raw errno_t, matching the code that is logged.
    return INTERNAL_ERROR;
  }

  const auto memcpy_param_head = reinterpret_cast<aicpu::AicpuParamHead *>(memcpy_args_.get());
  const uint32_t memcpy_io_num = memcpy_param_head->ioAddrNum;
  // memcpy_s only bounds the destination; make sure the source vector really holds
  // ioAddrNum entries before reading that many from it.
  if (memcpy_io_num > copy_io_addr_.size()) {
    REPORT_INNER_ERROR("E19999", "Node[MemCopy] io nums=%u exceeds prepared copy io addr num=%zu.",
                       memcpy_io_num, copy_io_addr_.size());
    GELOGE(INTERNAL_ERROR, "[Check][Size]Node[MemCopy] io nums=%u exceeds prepared copy io addr num=%zu.",
           memcpy_io_num, copy_io_addr_.size());
    return INTERNAL_ERROR;
  }

  auto memcpy_io_addr = memcpy_args_.get() + sizeof(aicpu::AicpuParamHead);
  // if has input and output, need copy to ioaddr
  const int cpy_ret = memcpy_s(memcpy_io_addr, memcpy_args_size_ - sizeof(aicpu::AicpuParamHead),
                               copy_io_addr_.data(), sizeof(uint64_t) * memcpy_io_num);
  if (cpy_ret != 0) {
    REPORT_INNER_ERROR("E19999", "Node[MemCopy] memcpy io addr to AicpuParamHead failed,"
                       "ret=%d, args_size=%u, io nums=%u.",
                       cpy_ret, memcpy_args_size_, memcpy_io_num);
    GELOGE(INTERNAL_ERROR, "[Update][io_addr]Node[MemCopy] memcpy io addr to AicpuParamHead failed,"
           "ret=%d, args_size=%u, io nums=%u.",
           cpy_ret, memcpy_args_size_, memcpy_io_num);
    return INTERNAL_ERROR;
  }
  GELOGD("Set memcpy task for node[MemCopy] successfully.");
  return SUCCESS;
}

Status AiCpuBaseTask::UpdateArgTable(const SingleOpModelParam &param) {
// aicpu tasks do not have a workspace for now
return DoUpdateArgTable(param, false);


+ 20
- 19
ge/single_op/task/op_task.h View File

@@ -182,6 +182,16 @@ class AiCpuBaseTask : public OpTask {
rtStream_t stream);
Status UpdateOutputShape(vector<GeTensorDesc> &output_desc);
Status UpdateShapeToOutputDesc(const GeShape &shape_new, GeTensorDesc &output_desc);
Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
vector<DataBuffer> &outputs,
rtStream_t stream);
Status ReadResultSummaryAndPrepareMemory();

Status PrepareCopyInputs(vector<DataBuffer> &outputs);

Status UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc);

virtual Status CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream) = 0;
// for blocking aicpu op
Status DistributeWaitTaskForAicpuBlockingOp(rtStream_t stream);
Status UpdateEventIdForBlockingAicpuOp();
@@ -197,6 +207,15 @@ class AiCpuBaseTask : public OpTask {
// for blocking aicpu op
bool is_blocking_aicpu_op_ = false;
rtEvent_t rt_event_ = nullptr;
std::vector<void *> output_summary_;
std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_;

void *copy_input_release_flag_dev_ = nullptr;
void *copy_input_data_size_dev_ = nullptr;
void *copy_input_src_dev_ = nullptr;
void *copy_input_dst_dev_ = nullptr;

vector<void *> out_shape_hbm_;
};

class AiCpuTask : public AiCpuBaseTask {
@@ -207,21 +226,12 @@ class AiCpuTask : public AiCpuBaseTask {
Status LaunchKernel(rtStream_t stream) override;
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;

Status SetMemCopyTask(const domi::KernelExDef &kernel_def);

private:
// for copy task.
Status InitForSummaryAndCopy();
Status UpdateShapeAndDataByResultSummary(vector<GeTensorDesc> &output_desc,
vector<DataBuffer> &outputs,
rtStream_t stream);
Status ReadResultSummaryAndPrepareMemory();

Status CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream);
Status PrepareCopyInputs(vector<DataBuffer> &outputs);

Status UpdateShapeByHbmBuffer(vector<GeTensorDesc> &output_desc);
Status CopyDataToHbm(vector<DataBuffer> &outputs, rtStream_t stream) override;

friend class AiCpuTaskBuilder;
void *workspace_addr_ = nullptr;
@@ -241,17 +251,8 @@ class AiCpuTask : public AiCpuBaseTask {
void *copy_task_args_buf_ = nullptr;
void *copy_workspace_buf_ = nullptr;

std::vector<void *> output_summary_;
std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_;

void *copy_ioaddr_dev_ = nullptr;

void *copy_input_release_flag_dev_ = nullptr;
void *copy_input_data_size_dev_ = nullptr;
void *copy_input_src_dev_ = nullptr;
void *copy_input_dst_dev_ = nullptr;

vector<void *> out_shape_hbm_;
uint64_t kernel_id_ = 0;
};



Loading…
Cancel
Save