diff --git a/ge/single_op/single_op.cc b/ge/single_op/single_op.cc index 2ab40d82..6e51b6ff 100755 --- a/ge/single_op/single_op.cc +++ b/ge/single_op/single_op.cc @@ -25,6 +25,7 @@ #include "graph/load/new_model_manager/model_utils.h" #include "runtime/mem.h" #include "single_op/single_op_manager.h" +#include "single_op/task/build_task_utils.h" #include "graph/load/new_model_manager/model_manager.h" namespace ge { @@ -77,7 +78,8 @@ Status ProfilingTaskInfo(OpTask *op_task) { } } // namespace -SingleOp::SingleOp(std::mutex *stream_mutex, rtStream_t stream) : stream_mutex_(stream_mutex), stream_(stream) { +SingleOp::SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream) + : stream_resource_(stream_resource), stream_mutex_(stream_mutex), stream_(stream) { } FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() { @@ -159,37 +161,6 @@ Status SingleOp::UpdateArgs(const std::vector &inputs, const std::ve *arg_addr = args_[i]; } } - // update aicpu_TF or aicpu_CC args - for (auto &task : tasks_) { - size_t io_addr_num = args_.size(); - if (task->GetOpTaskType() == OP_TASK_AICPU) { - GELOGD("Update aicpu_TF task args"); - task->SetIoAddrsForDump(args_); - auto *dst_io_addr = const_cast(reinterpret_cast(task->GetIOAddr())); - GE_CHECK_NOTNULL(dst_io_addr); - auto rt_ret = rtMemcpyAsync(dst_io_addr, - sizeof(uint64_t) * args_.size(), - &args_[0], - sizeof(uint64_t) * args_.size(), - RT_MEMCPY_HOST_TO_DEVICE_EX, - stream_); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMemcpyAsync addresses failed, ret = %d", rt_ret); - return rt_ret; - } - } else if (task->GetOpTaskType() == OP_TASK_AICPUCC) { - GELOGD("Update aicpu_CC task args"); - const uintptr_t *task_io_addr = reinterpret_cast(task->GetIOAddr()); - GE_CHECK_NOTNULL(task_io_addr); - auto io_addr = reinterpret_cast(const_cast(task_io_addr)); - for (size_t i = 0; i < io_addr_num; ++i) { - io_addr[i] = static_cast(args_[i]); - } - } else { - GELOGW("Only TF_kernel aicpu and aicpu_CC are supported, but got %u", task->GetOpTaskType()); - continue; - } - } return SUCCESS; } @@ -200,7 +171,19 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c return ret; } + GE_CHECK_NOTNULL(stream_resource_); std::lock_guard lk(*stream_mutex_); + auto current_mem_base = stream_resource_->GetMemoryBase(); + if (running_param_->mem_base != current_mem_base) { + running_param_->mem_base = const_cast(current_mem_base); + GELOGD("Memory base changed, new memory base = %p", current_mem_base); + for (auto &task : tasks_) { + auto new_address = BuildTaskUtils::GetAddresses(task->GetOpdesc(), *running_param_); + GE_CHK_STATUS_RET(task->UpdateArgTable(*running_param_), + "[%s] Failed to update arg table", + task->GetOpdesc()->GetName().c_str()); + } + } ret = UpdateArgs(inputs, outputs); if (ret != SUCCESS) { return ret; @@ -225,9 +208,6 @@ DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex : resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) { } -DynamicSingleOp::~DynamicSingleOp() { -} - Status DynamicSingleOp::ValidateParams(const vector &input_desc, const std::vector &inputs, std::vector &output_desc, @@ -249,65 +229,24 @@ Status DynamicSingleOp::ValidateParams(const vector &input_desc, } if (input_desc.size() != num_inputs_) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input number mismatches. expect %zu, but given %zu", - num_inputs_, input_desc.size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "Input number mismatches. expect %zu, but given %zu", + num_inputs_, + input_desc.size()); return ACL_ERROR_GE_PARAM_INVALID; } if (output_desc.size() != num_outputs_) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Output number mismatches. expect %zu, but given %zu", - num_outputs_, output_desc.size()); + GELOGE(ACL_ERROR_GE_PARAM_INVALID, + "Output number mismatches. expect %zu, but given %zu", + num_outputs_, + output_desc.size()); return ACL_ERROR_GE_PARAM_INVALID; } return SUCCESS; } -Status DynamicSingleOp::AllocateWorkspaces(const std::vector &workspace_sizes, - std::vector &workspaces) { - static const std::string kPurpose("malloc workspace memory for dynamic op."); - if (workspace_sizes.empty()) { - GELOGD("No need to allocate workspace."); - return SUCCESS; - } - int64_t total_size = 0; - std::vector ws_offsets; - for (auto ws_size : workspace_sizes) { - // alignment and padding should be done in OpParaCalculate - GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size)); - ws_offsets.emplace_back(total_size); - total_size += ws_size; - } - - GELOGD("Total workspace size is %ld", total_size); - StreamResource *stream_resource = SingleOpManager::GetInstance().GetResource(resource_id_, stream_); - GE_CHECK_NOTNULL(stream_resource); - auto ws_base = stream_resource->MallocMemory(kPurpose, static_cast(total_size)); - if (ws_base == nullptr) { - GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size); - return ACL_ERROR_GE_MEMORY_ALLOCATION; - } - GELOGD("Done allocating workspace memory successfully."); - - for (auto ws_offset : ws_offsets) { - workspaces.emplace_back(ws_base + ws_offset); - } - - return SUCCESS; -} - -Status DynamicSingleOp::ExecuteTbeTask(const vector &input_desc, - const vector &inputs, - vector &output_desc, - vector &outputs) { - GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc)); - - std::vector workspace_buffers; - GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers)); - - return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_); -} - Status DynamicSingleOp::ExecuteAsync(const vector &input_desc, const vector &input_buffers, vector &output_desc, @@ -316,32 +255,8 @@ Status DynamicSingleOp::ExecuteAsync(const vector &input_desc, GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers)); std::lock_guard lk(*stream_mutex_); - std::vector inputs; - std::vector outputs; - for (auto &buffer : input_buffers) { - inputs.emplace_back(buffer.data); - } - for (auto &buffer : output_buffers) { - outputs.emplace_back(buffer.data); - } - - if (op_task_->GetOpTaskType() == OP_TASK_TBE) { - auto ret = ExecuteTbeTask(input_desc, inputs, output_desc, outputs); - if (ret == SUCCESS) { - GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get())); - } - return ret; - } else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) { - auto aicpu_ret = op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_); - if (aicpu_ret == SUCCESS) { - GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get())); - } - return aicpu_ret; - } else { - GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, - "Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u", - op_task_->GetOpTaskType()); - return ACL_ERROR_GE_OP_TASK_TYPE_INVALID; - } + GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_)); + GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get())); + return SUCCESS; } } // namespace ge diff --git a/ge/single_op/single_op.h b/ge/single_op/single_op.h index 14ef8ce1..d677f94a 100755 --- a/ge/single_op/single_op.h +++ b/ge/single_op/single_op.h @@ -30,9 +30,11 @@ #include "cce/aicpu_engine_struct.h" namespace ge { +class StreamResource; +struct SingleOpModelParam; class SingleOp { public: - SingleOp(std::mutex *stream_mutex, rtStream_t stream); + SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream); ~SingleOp(); Status ExecuteAsync(const std::vector &inputs, const std::vector &outputs); @@ -44,6 +46,7 @@ class SingleOp { Status GetArgs(const std::vector &inputs, const std::vector &outputs); friend class SingleOpModel; + StreamResource *stream_resource_; std::mutex *stream_mutex_; rtStream_t stream_ = nullptr; std::vector input_addr_list_; @@ -54,12 +57,13 @@ class SingleOp { std::vector tasks_; std::vector> arg_table_; + std::unique_ptr running_param_; }; class DynamicSingleOp { public: DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream); - ~DynamicSingleOp(); + ~DynamicSingleOp() = default; Status ExecuteAsync(const vector &input_desc, const std::vector &inputs, std::vector &output_desc, @@ -72,14 +76,6 @@ class DynamicSingleOp { std::vector &output_desc, std::vector &outputs) const; - Status AllocateWorkspaces(const std::vector &workspace_sizes, - std::vector &workspaces); - - Status ExecuteTbeTask(const vector &input_desc, - const vector &inputs, - vector &output_desc, - vector &outputs); - std::unique_ptr op_task_; uintptr_t resource_id_ = 0; std::mutex *stream_mutex_; diff --git a/ge/single_op/single_op_model.cc b/ge/single_op/single_op_model.cc index 525f479b..6b4f6b04 100755 --- a/ge/single_op/single_op_model.cc +++ b/ge/single_op/single_op_model.cc @@ -92,7 +92,8 @@ Status SingleOpModel::InitModelMem(StreamResource &res) { if (model_params_.memory_size > model_params_.zero_copy_mem_size) { const string purpose("malloc feature map memory on model execute."); GELOGI("total memory: %lu, zero_copy_mem: %lu", model_params_.memory_size, model_params_.zero_copy_mem_size); - model_params_.mem_base = res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size); + model_params_.mem_base = + res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size, false); if (model_params_.mem_base == nullptr) { return ACL_ERROR_GE_MEMORY_ALLOCATION; } @@ -226,9 +227,10 @@ Status SingleOpModel::SetInputsAndOutputs(SingleOp &single_op) { return SUCCESS; } -Status SingleOpModel::BuildTaskList(SingleOp &single_op) { +Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &single_op) { auto ge_model = model_helper_.GetGeModel(); GE_CHECK_NOTNULL(ge_model); + single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size()); auto tasks = ge_model->GetModelTaskDefPtr()->task(); for (int i = 0; i < tasks.size(); ++i) { const TaskDef &task_def = tasks[i]; @@ -247,9 +249,11 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { return ret; } - single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size()); ParseArgTable(tbe_task, single_op); tbe_task->SetModelArgs(model_name_, model_id_); + if (tbe_task->tiling_buffer_ != nullptr) { + tbe_task->stream_resource_ = stream_resource; + } single_op.tasks_.emplace_back(tbe_task); } else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) { GELOGD("Building AICPU_CC task"); @@ -261,6 +265,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { return ret; } task->SetModelArgs(model_name_, model_id_); + ParseArgTable(task, single_op); single_op.tasks_.emplace_back(task); } else { GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID, @@ -278,6 +283,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { return ret; } aicpu_task->SetModelArgs(model_name_, model_id_); + ParseArgTable(aicpu_task, single_op); single_op.tasks_.emplace_back(aicpu_task); } else { // skip @@ -287,21 +293,23 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { return SUCCESS; } -void SingleOpModel::ParseArgTable(TbeOpTask *task, SingleOp &op) { +void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) { if (task == nullptr) { GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "tbe op task is nullptr"); return; } + // args: addr1, addr2, addr3 ... - auto *args = const_cast(reinterpret_cast(task->GetArgs())); - size_t arg_size = task->GetArgSize(); - for (size_t i = 0; i < arg_size / sizeof(void *); ++i) { - uintptr_t *ptr_to_addr = args + i; + uintptr_t *arg_base = nullptr; + size_t arg_num = 0; + task->GetIoAddr(arg_base, arg_num); + for (size_t i = 0; i < arg_num; ++i) { + uintptr_t *ptr_to_addr = arg_base + i; uintptr_t addr = *ptr_to_addr; auto iter = model_params_.addr_mapping_.find(addr); if (iter != model_params_.addr_mapping_.end()) { int arg_index = iter->second; - GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetStubName().c_str(), i, arg_index); + GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetOpdesc()->GetName().c_str(), i, arg_index); op.arg_table_[iter->second].emplace_back(ptr_to_addr); } } @@ -386,8 +394,10 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) { GE_CHK_STATUS_RET_NOLOG(ParseInputsAndOutputs()); GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource)); + single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params_)); + GE_CHECK_NOTNULL(single_op.running_param_); GE_CHK_STATUS_RET_NOLOG(SetInputsAndOutputs(single_op)); - return BuildTaskList(single_op); + return BuildTaskList(&resource, single_op); } Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) { diff --git a/ge/single_op/single_op_model.h b/ge/single_op/single_op_model.h index 5f1c842a..c3164543 100755 --- a/ge/single_op/single_op_model.h +++ b/ge/single_op/single_op_model.h @@ -65,7 +65,7 @@ class SingleOpModel { Status ParseInputNode(const OpDescPtr &op_desc); void ParseOutputNode(const OpDescPtr &op_desc); - Status BuildTaskList(SingleOp &single_op); + Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op); Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op); Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task); Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, @@ -74,7 +74,7 @@ class SingleOpModel { Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op); static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam ¶m); - void ParseArgTable(TbeOpTask *task, SingleOp &op); + void ParseArgTable(OpTask *task, SingleOp &op); std::string model_name_; uint32_t model_id_ = 0; diff --git a/ge/single_op/stream_resource.cc b/ge/single_op/stream_resource.cc index f545b6c8..722a1024 100755 --- a/ge/single_op/stream_resource.cc +++ b/ge/single_op/stream_resource.cc @@ -69,11 +69,25 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose, size_t size, size_t &max_allocated, std::vector &allocated) { + if (size == 0) { + GELOGD("Mem size == 0"); + return nullptr; + } + if (size <= max_allocated && !allocated.empty()) { GELOGD("reuse last memory"); return allocated.back(); } + if (!allocated.empty()) { + uint8_t *current_buffer = allocated.back(); + allocated.pop_back(); + if (rtStreamSynchronize(stream_) != RT_ERROR_NONE) { + GELOGW("Failed to invoke rtStreamSynchronize"); + } + (void) rtFree(current_buffer); + } + uint8_t *buffer = nullptr; auto ret = rtMalloc(reinterpret_cast(&buffer), size, RT_MEMORY_HBM); if (ret != RT_ERROR_NONE) { @@ -96,10 +110,14 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose, return buffer; } -uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size) { +uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size, bool holding_lock) { GELOGD("To Malloc memory, size = %zu", size); - uint8_t *buffer = DoMallocMemory(purpose, size, max_memory_size_, memory_list_); - return buffer; + if (holding_lock) { + return DoMallocMemory(purpose, size, max_memory_size_, memory_list_); + } else { + std::lock_guard lk(stream_mu_); + return DoMallocMemory(purpose, size, max_memory_size_, memory_list_); + } } uint8_t *StreamResource::MallocWeight(const std::string &purpose, size_t size) { @@ -158,7 +176,7 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData & return ret; } - auto new_op = std::unique_ptr(new(std::nothrow) SingleOp(&stream_mu_, stream_)); + auto new_op = std::unique_ptr(new(std::nothrow) SingleOp(this, &stream_mu_, stream_)); if (new_op == nullptr) { GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "new SingleOp failed"); return ACL_ERROR_GE_MEMORY_ALLOCATION; @@ -171,4 +189,12 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData & op_map_[model_data.model_data] = std::move(new_op); return SUCCESS; } + +const uint8_t *StreamResource::GetMemoryBase() const { + if (memory_list_.empty()) { + return nullptr; + } + + return memory_list_.back(); +} } // namespace ge diff --git a/ge/single_op/stream_resource.h b/ge/single_op/stream_resource.h index 39f08ebe..d5bc941a 100755 --- a/ge/single_op/stream_resource.h +++ b/ge/single_op/stream_resource.h @@ -45,8 +45,9 @@ class StreamResource { Status BuildOperator(const std::string &model_name, const ModelData &model_data, SingleOp **single_op); Status BuildDynamicOperator(const std::string &model_name, const ModelData &model_data, DynamicSingleOp **single_op); - uint8_t *MallocMemory(const std::string &purpose, size_t size); + uint8_t *MallocMemory(const std::string &purpose, size_t size, bool holding_lock = true); uint8_t *MallocWeight(const std::string &purpose, size_t size); + const uint8_t *GetMemoryBase() const; private: uint8_t *DoMallocMemory(const std::string &purpose, diff --git a/ge/single_op/task/aicpu_kernel_task_builder.cc b/ge/single_op/task/aicpu_kernel_task_builder.cc index cd218c94..c676ccf8 100755 --- a/ge/single_op/task/aicpu_kernel_task_builder.cc +++ b/ge/single_op/task/aicpu_kernel_task_builder.cc @@ -17,17 +17,22 @@ #include "single_op/task/aicpu_kernel_task_builder.h" #include "framework/common/taskdown_common.h" #include "graph/load/new_model_manager/model_manager.h" +#include "build_task_utils.h" namespace ge { AiCpuCCTaskBuilder::AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def) : op_desc_(op_desc), kernel_def_(kernel_def) {} -Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) { +Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam ¶m) { size_t aicpu_arg_size = kernel_def_.args_size(); - if (aicpu_arg_size <= 0) { + if (aicpu_arg_size <= sizeof(aicpu::AicpuParamHead)) { GELOGE(ACL_ERROR_GE_PARAM_INVALID, "aicpu_arg_size is invalid, value = %zu", aicpu_arg_size); return ACL_ERROR_GE_PARAM_INVALID; } + + task.io_addr_num_ = op_desc_->GetInputsSize() + op_desc_->GetOutputsSize(); + GE_CHECK_GE(aicpu_arg_size - sizeof(aicpu::AicpuParamHead), task.io_addr_num_ * sizeof(void *)); + std::unique_ptr aicpu_args; aicpu_args.reset(new(std::nothrow) uint8_t[aicpu_arg_size]()); if (aicpu_args == nullptr) { @@ -41,13 +46,19 @@ Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) { return ACL_ERROR_GE_INTERNAL_ERROR; } - task.SetIoAddr(aicpu_args.get() + sizeof(aicpu::AicpuParamHead)); + task.SetIoAddr(reinterpret_cast(aicpu_args.get() + sizeof(aicpu::AicpuParamHead))); task.SetKernelArgs(std::move(aicpu_args), aicpu_arg_size); + + auto addresses = BuildTaskUtils::GetKernelArgs(op_desc_, param); + GE_CHECK_GE(addresses.size(), task.io_addr_num_); + for (size_t i = 0; i < task.io_addr_num_; ++i) { + task.io_addr_[i] = reinterpret_cast(addresses[i]); + } return SUCCESS; } -Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) { - auto ret = SetKernelArgs(task); +Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam ¶m) { + auto ret = SetKernelArgs(task, param); if (ret != SUCCESS) { return ret; } diff --git a/ge/single_op/task/aicpu_kernel_task_builder.h b/ge/single_op/task/aicpu_kernel_task_builder.h index e77e3c10..85d5034d 100755 --- a/ge/single_op/task/aicpu_kernel_task_builder.h +++ b/ge/single_op/task/aicpu_kernel_task_builder.h @@ -30,10 +30,10 @@ class AiCpuCCTaskBuilder { explicit AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def); ~AiCpuCCTaskBuilder() = default; - Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id); + Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam ¶m); private: - Status SetKernelArgs(AiCpuCCTask &task); + Status SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam ¶m); const OpDescPtr op_desc_; const domi::KernelDef &kernel_def_; }; diff --git a/ge/single_op/task/aicpu_task_builder.cc b/ge/single_op/task/aicpu_task_builder.cc index 8f28ffda..0cc5c554 100755 --- a/ge/single_op/task/aicpu_task_builder.cc +++ b/ge/single_op/task/aicpu_task_builder.cc @@ -26,26 +26,6 @@ namespace ge { AiCpuTaskBuilder::AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def) : op_desc_(op_desc), kernel_def_(kernel_def) {} - Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector &addresses) { - size_t arg_size = kernel_def_.args_size(); - auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM); - if (rt_ret != RT_ERROR_NONE) { - GELOGE(rt_ret, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret); - return rt_ret; - } - - const void *src_addr = reinterpret_cast(addresses.data()); - uint64_t src_len = sizeof(void *) * addresses.size(); - rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE); - if (rt_ret != RT_ERROR_NONE) { - (void)rtFree(*io_addr); - GELOGE(rt_ret, "rtMemcpy addresses failed, ret = %d", rt_ret); - return rt_ret; - } - - return SUCCESS; - } - Status AiCpuTaskBuilder::SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &fwk_op_kernel) { auto sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL), kernel_def_.args().data(), kernel_def_.args().size()); @@ -80,39 +60,27 @@ namespace ge { return SUCCESS; } - Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, - const SingleOpModelParam ¶m, bool dynamic_flag) { + Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag) { if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) { GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size()); return ACL_ERROR_GE_PARAM_INVALID; } - auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param); - auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace); - - if (dynamic_flag) { - GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM)); - } else { - if (ws_addr_vec.empty()) { - GELOGE(ACL_ERROR_GE_PARAM_INVALID, "workspace Data Address is empty."); - return ACL_ERROR_GE_PARAM_INVALID; - } - *kernel_workspace = ws_addr_vec[0]; - } - GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(), + GE_CHK_RT_RET(rtMalloc(&task.workspace_addr_, kernel_def_.task_info_size(), RT_MEMORY_HBM)); + GE_CHK_RT_RET(rtMemcpy(task.workspace_addr_, kernel_def_.task_info_size(), kernel_def_.task_info().data(), kernel_def_.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE)); - auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses)); - if (ret != SUCCESS) { - return ret; - } + auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false); + task.io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses); + task.io_addr_size_ = task.io_addr_host_.size() * sizeof(void *); + GE_CHK_RT_RET(rtMalloc(&task.io_addr_, task.io_addr_size_, RT_MEMORY_HBM)); return SUCCESS; } Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag, uint64_t kernel_id) { - GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&task.io_addr_, &task.workspace_addr_, param, dynamic_flag)); + GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(task, param, dynamic_flag)); STR_FWK_OP_KERNEL fwk_op_kernel = {0}; auto ret = SetFmkOpKernel(task.io_addr_, task.workspace_addr_, fwk_op_kernel); diff --git a/ge/single_op/task/aicpu_task_builder.h b/ge/single_op/task/aicpu_task_builder.h index 4669e118..fe9c9bc2 100755 --- a/ge/single_op/task/aicpu_task_builder.h +++ b/ge/single_op/task/aicpu_task_builder.h @@ -33,10 +33,8 @@ namespace ge { private: static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel); - Status SetInputOutputAddr(void **io_addr, const std::vector &addresses); Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel); - Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, - const SingleOpModelParam ¶m, bool dynamic_flag); + Status InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag); const OpDescPtr op_desc_; const domi::KernelExDef &kernel_def_; diff --git a/ge/single_op/task/build_task_utils.cc b/ge/single_op/task/build_task_utils.cc index 29f1657b..071e514b 100644 --- a/ge/single_op/task/build_task_utils.cc +++ b/ge/single_op/task/build_task_utils.cc @@ -32,7 +32,8 @@ const uint64_t kVarSize = 0; } std::vector> BuildTaskUtils::GetAddresses(const OpDescPtr &op_desc, - const SingleOpModelParam ¶m) { + const SingleOpModelParam ¶m, + bool keep_workspace) { std::vector> ret; RuntimeParam runtime_para; runtime_para.mem_size = param.memory_size; @@ -49,7 +50,9 @@ std::vector> BuildTaskUtils::GetAddresses(const OpDescPtr &o ret.emplace_back(ModelUtils::GetInputDataAddrs(runtime_para, op_desc)); ret.emplace_back(ModelUtils::GetOutputDataAddrs(runtime_para, op_desc)); - ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc)); + if (keep_workspace) { + ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc)); + } return ret; } diff --git a/ge/single_op/task/build_task_utils.h b/ge/single_op/task/build_task_utils.h index cddc7a2b..7a2369e4 100644 --- a/ge/single_op/task/build_task_utils.h +++ b/ge/single_op/task/build_task_utils.h @@ -27,15 +27,17 @@ namespace ge { class BuildTaskUtils { public: + static constexpr int kAddressIndexOutput = 1; static constexpr int kAddressIndexWorkspace = 2; - static std::vector> GetAddresses(const OpDescPtr &op_desc, const SingleOpModelParam ¶m); + static std::vector> GetAddresses(const OpDescPtr &op_desc, + const SingleOpModelParam ¶m, + bool keep_workspace = true); static std::vector JoinAddresses(const std::vector> &addresses); static std::vector GetKernelArgs(const OpDescPtr &op_desc, const SingleOpModelParam ¶m); static std::string GetTaskInfo(const OpDescPtr &op_desc); template - static std::string VectorToString(const std::vector &values) - { + static std::string VectorToString(const std::vector &values) { std::stringstream ss; ss << '['; auto size = values.size(); diff --git a/ge/single_op/task/op_task.cc b/ge/single_op/task/op_task.cc index f8b019e9..a714c6a8 100755 --- a/ge/single_op/task/op_task.cc +++ b/ge/single_op/task/op_task.cc @@ -24,9 +24,11 @@ #include "common/dump/dump_manager.h" #include "common/dump/dump_op.h" #include "common/formats/formats.h" +#include "common/math/math_util.h" #include "framework/common/debug/log.h" #include "register/op_tiling.h" #include "runtime/rt.h" +#include "build_task_utils.h" namespace ge { namespace { @@ -48,18 +50,22 @@ Status OpTask::OpenDump(rtStream_t stream) { std::vector output_adds; auto input_size = op_desc_->GetInputsSize(); auto output_size = op_desc_->GetOutputsSize(); - auto all_size = io_addrs_for_dump_.size(); - if (input_size + output_size != all_size) { - GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", all_size, + uintptr_t *arg_base = nullptr; + size_t arg_num = 0; + GetIoAddr(arg_base, arg_num); + if (arg_num < input_size + output_size) { + GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", + arg_num, input_size + output_size); return FAILED; } + for (size_t i = 0; i < input_size; i++) { - uint64_t input_addr = io_addrs_for_dump_[i]; + uint64_t input_addr = arg_base[i]; input_addrs.emplace_back(input_addr); } for (size_t j = 0; j < output_size; j++) { - uint64_t output_addr = io_addrs_for_dump_[input_size + j]; + uint64_t output_addr = arg_base[input_size + j]; output_adds.emplace_back(output_addr); } dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream); @@ -89,10 +95,6 @@ void TbeOpTask::SetKernelArgs(std::unique_ptr &&args, size_t arg_size void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; } -const vector &OpTask::GetWorkspaceSizes() const { return workspace_sizes_; } - -void OpTask::SetWorkspaceSizes(const vector &workspace_sizes) { workspace_sizes_ = workspace_sizes; } - void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) { model_name_ = model_name; model_id_ = model_id; @@ -107,6 +109,36 @@ Status OpTask::GetProfilingArgs(std::string &model_name, std::string &op_name, u op_name = op_desc_->GetName(); return SUCCESS; } +Status OpTask::UpdateRunInfo(const vector &input_desc, const vector &output_desc) { + return UNSUPPORTED; +} +Status OpTask::UpdateArgTable(const SingleOpModelParam ¶m) { + auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param); + auto all_addresses = BuildTaskUtils::JoinAddresses(addresses); + uintptr_t *arg_base = nullptr; + size_t arg_num = 0; + GetIoAddr(arg_base, arg_num); + if (arg_num != all_addresses.size()) { + GELOGE(INTERNAL_ERROR, "[%s] arg number mismatches, expect = %zu, but got = %zu", + op_desc_->GetName().c_str(), + arg_num, + all_addresses.size()); + return INTERNAL_ERROR; + } + + for (void *addr : all_addresses) { + *arg_base++ = reinterpret_cast(addr); + } + return SUCCESS; +} + +Status OpTask::LaunchKernel(const vector &input_desc, + const vector &input_buffers, + vector &output_desc, + vector &output_buffers, + rtStream_t stream) { + return UNSUPPORTED; +} TbeOpTask::~TbeOpTask() { if (sm_desc_ != nullptr) { @@ -141,12 +173,6 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) { return RT_FAILED; } GELOGI("[TASK_INFO] %s", this->stub_name_.c_str()); - - size_t input_size = op_desc_->GetInputsSize(); - size_t output_size = op_desc_->GetOutputsSize(); - uint64_t *io_addr = reinterpret_cast(args_.get()); - std::vector io_addrs(io_addr, io_addr + input_size + output_size); - SetIoAddrsForDump(io_addrs); auto status = OpenDump(stream); if (status != SUCCESS) { GELOGE(status, "Open dump failed in the tbe single op %s", this->stub_name_.c_str()); @@ -167,11 +193,12 @@ Status TbeOpTask::UpdateRunInfo(const vector &input_desc, const ve GELOGE(FAILED, "Failed to invoke OpParaCalculate. ret = %u", ret); return FAILED; } - SetWorkspaceSizes(run_info.workspaces); block_dim_ = run_info.block_dim; tiling_data_ = run_info.tiling_data.str(); GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu", block_dim_, tiling_data_.size()); + + GE_CHK_STATUS_RET(AllocateWorkspaces(run_info.workspaces), "Failed to allocate workspaces"); return SUCCESS; } @@ -227,13 +254,54 @@ void TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, s max_tiling_size_ = max_tiling_size; } -Status TbeOpTask::LaunchKernel(const vector &inputs, const vector &outputs, - const vector &workspaces, rtStream_t stream) { +Status TbeOpTask::AllocateWorkspaces(const vector &workspace_sizes) { + static const std::string kPurpose("malloc workspace memory for dynamic op."); + if (workspace_sizes.empty()) { + GELOGD("No need to allocate workspace."); + return SUCCESS; + } + int64_t total_size = 0; + std::vector ws_offsets; + for (auto ws_size : workspace_sizes) { + // alignment and padding should be done in OpParaCalculate + GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size)); + ws_offsets.emplace_back(total_size); + total_size += ws_size; + } + + GELOGD("Total workspace size is %ld", total_size); + GE_CHECK_NOTNULL(stream_resource_); + auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast(total_size)); + if (ws_base == nullptr) { + GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size); + return ACL_ERROR_GE_MEMORY_ALLOCATION; + } + GELOGD("Done allocating workspace memory successfully."); + + for (auto ws_offset : ws_offsets) { + workspaces_.emplace_back(ws_base + ws_offset); + } + + return SUCCESS; +} + +Status TbeOpTask::LaunchKernel(const vector &input_desc, + const vector &input_buffers, + vector &output_desc, + vector &output_buffers, + rtStream_t stream) { + GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo(input_desc, output_desc)); GELOGD("[%s] Start to launch kernel", node_->GetName().c_str()); std::vector args; - args.insert(args.end(), inputs.begin(), inputs.end()); - args.insert(args.end(), outputs.begin(), outputs.end()); - args.insert(args.end(), workspaces.begin(), workspaces.end()); + for (auto &buffer : input_buffers) { + args.emplace_back(buffer.data); + } + for (auto &buffer : output_buffers) { + args.emplace_back(buffer.data); + } + for (auto &buffer : workspaces_) { + args.emplace_back(buffer); + } if (tiling_buffer_ != nullptr) { GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size()); @@ -254,6 +322,14 @@ Status TbeOpTask::LaunchKernel(const vector &inputs, const vector(args_.get()); + arg_count = arg_size_ / sizeof(void *); + if (tiling_buffer_ != nullptr) { + --arg_count; + } +} + AiCpuBaseTask::~AiCpuBaseTask() { if (ext_info_addr_dev_ != nullptr) { (void)rtFree(ext_info_addr_dev_); @@ -399,12 +475,14 @@ AiCpuTask::~AiCpuTask() { } } -const void *AiCpuTask::GetIOAddr() const { return io_addr_; } - Status AiCpuTask::LaunchKernel(rtStream_t stream) { GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str()); - auto ret = rtMemcpyAsync(workspace_addr_, task_info_.size(), task_info_.data(), task_info_.size(), - RT_MEMCPY_HOST_TO_DEVICE_EX, stream); + auto ret = rtMemcpyAsync(io_addr_, + io_addr_size_, + io_addr_host_.data(), + io_addr_host_.size() * sizeof(void *), + RT_MEMCPY_HOST_TO_DEVICE_EX, + stream); if (ret != RT_ERROR_NONE) { GELOGE(RT_FAILED, "rtMemcpyAsync workspace data failed. ret = %d, task = %s", ret, this->op_type_.c_str()); return RT_FAILED; @@ -680,6 +758,17 @@ Status AiCpuTask::LaunchKernel(const std::vector &input_desc, return SUCCESS; } +Status AiCpuTask::UpdateArgTable(const SingleOpModelParam ¶m) { + auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false); + io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses); + return SUCCESS; +} + +void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { + arg_base = reinterpret_cast(io_addr_host_.data()); + arg_count = io_addr_host_.size(); +} + void AiCpuCCTask::SetKernelArgs(std::unique_ptr args, size_t arg_size) { args_ = std::move(args); arg_size_ = arg_size; @@ -691,9 +780,7 @@ void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; } void AiCpuCCTask::SetkernelName(const std::string &kernel_Name) { kernel_name_ = kernel_Name; } -void AiCpuCCTask::SetIoAddr(void *io_addr) { io_addr_ = io_addr; } - -const void *AiCpuCCTask::GetIOAddr() const { return io_addr_; } +void AiCpuCCTask::SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; } const void *AiCpuCCTask::GetArgs() const { return args_.get(); } @@ -716,12 +803,6 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) { return ret; } GELOGD("Invoke rtCpuKernelLaunch succeeded"); - - size_t input_size = op_desc_->GetInputsSize(); - size_t output_size = op_desc_->GetOutputsSize(); - uint64_t *io_addr = reinterpret_cast(io_addr_); - std::vector io_addrs (io_addr, io_addr + input_size + output_size); - SetIoAddrsForDump(io_addrs); auto status = OpenDump(stream); if (status != SUCCESS) { GELOGE(status, "Open dump failed in the aicpucc single op %s", this->kernel_name_.c_str()); @@ -761,4 +842,9 @@ Status AiCpuCCTask::LaunchKernel(const std::vector &input_desc, return SUCCESS; } + +void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { + arg_base = io_addr_; + arg_count = io_addr_num_; +} } // namespace ge diff --git a/ge/single_op/task/op_task.h b/ge/single_op/task/op_task.h index df80088d..04e0def2 100644 --- a/ge/single_op/task/op_task.h +++ b/ge/single_op/task/op_task.h @@ -32,49 +32,27 @@ #include "init/gelib.h" namespace ge { -enum OpTaskType { - OP_TASK_TBE = 0, - OP_TASK_AICPU, - OP_TASK_AICPUCC, - OP_TASK_INVALID, -}; - +class StreamResource; +struct SingleOpModelParam; class OpTask { public: OpTask() = default; virtual ~OpTask() = default; virtual Status LaunchKernel(rtStream_t stream) = 0; virtual Status UpdateRunInfo(const vector &input_desc, - const vector &output_desc) { - return UNSUPPORTED; - } - virtual Status LaunchKernel(const std::vector &inputs, - const std::vector &outputs, - const std::vector &workspaces, - rtStream_t stream) { - return UNSUPPORTED; - } - virtual OpTaskType GetOpTaskType() = 0; - virtual const void *GetIOAddr() const = 0; - const vector &GetWorkspaceSizes() const; - void SetWorkspaceSizes(const vector &workspace_sizes); + const vector &output_desc); + virtual Status UpdateArgTable(const SingleOpModelParam ¶m); void SetModelArgs(std::string model_name, uint32_t model_id); Status GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id, uint32_t &block_dim); const OpDescPtr &GetOpdesc() const {return op_desc_;} Status OpenDump(rtStream_t stream); - void SetIoAddrsForDump(const vector &io_addrs_for_dump) { - io_addrs_for_dump_ = io_addrs_for_dump; - } + virtual void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) = 0; virtual Status LaunchKernel(const std::vector &input_desc, const std::vector &input_buffers, std::vector &output_desc, std::vector &output_buffers, - rtStream_t stream) { - return UNSUPPORTED; - } + rtStream_t stream); - private: - std::vector workspace_sizes_; protected: DumpProperties dump_properties_; DumpOp dump_op_; @@ -82,19 +60,18 @@ class OpTask { std::string model_name_; uint32_t model_id_ = 0; uint32_t block_dim_ = 1; - std::vector io_addrs_for_dump_; }; class TbeOpTask : public OpTask { public: ~TbeOpTask() override; Status LaunchKernel(rtStream_t stream) override; - OpTaskType GetOpTaskType() override { - return OP_TASK_TBE; - } - const void *GetIOAddr() const override { - return nullptr; - } + Status LaunchKernel(const std::vector &input_desc, + const std::vector &input_buffers, + std::vector &output_desc, + std::vector &output_buffers, + rtStream_t stream) override; + void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; void SetSmDesc(void *sm_desc); void SetStubFunc(const std::string &name, const void *stub_func); void SetKernelArgs(std::unique_ptr &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc); @@ -102,20 +79,17 @@ class TbeOpTask : public OpTask { Status UpdateRunInfo(const vector &input_desc, const vector &output_desc) override; - Status LaunchKernel(const vector &inputs, - const vector &outputs, - const vector &workspaces, - rtStream_t stream) override; - const void *GetArgs() const; size_t GetArgSize() const; const std::string &GetStubName() const; void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size); private: + friend class SingleOpModel; static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor); Status UpdateNodeByShape(const vector &input_desc, const vector &output_desc); + Status AllocateWorkspaces(const std::vector &workspace_sizes); const void *stub_func_ = nullptr; std::unique_ptr args_; @@ -123,9 +97,11 @@ class TbeOpTask : public OpTask { void *sm_desc_ = nullptr; std::string stub_name_; + StreamResource *stream_resource_ = nullptr; void *tiling_buffer_ = nullptr; uint32_t max_tiling_size_ = 0; std::string tiling_data_; + std::vector workspaces_; NodePtr node_; }; @@ -133,7 +109,7 @@ class AiCpuBaseTask : public OpTask { public: AiCpuBaseTask() = default; ~AiCpuBaseTask() override; - const UnknowShapeOpType GetUnknownType() const { return unknown_type_; } + UnknowShapeOpType GetUnknownType() const { return unknown_type_; } protected: Status SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id); @@ -158,10 +134,8 @@ class AiCpuTask : public AiCpuBaseTask { ~AiCpuTask() override; Status LaunchKernel(rtStream_t stream) override; - OpTaskType GetOpTaskType() override { - return OP_TASK_AICPU; - } - const void *GetIOAddr() const override; + Status UpdateArgTable(const SingleOpModelParam ¶m) override; + void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; Status LaunchKernel(const std::vector &input_desc, const std::vector &input_buffers, @@ -188,27 +162,31 @@ class AiCpuTask : public AiCpuBaseTask { friend class AiCpuTaskBuilder; void *workspace_addr_ = nullptr; std::string task_info_; - // device addr + // device addr void *args_ = nullptr; size_t arg_size_ = 0; std::string op_type_; // device addr void *io_addr_ = nullptr; + size_t io_addr_size_ = 0; + + // host addr + std::vector io_addr_host_; bool dynamic_flag_ = false; // for copy task - void *copy_task_args_buf_; - void *copy_workspace_buf_; + void *copy_task_args_buf_ = nullptr; + void *copy_workspace_buf_ = nullptr; std::vector output_summary_; std::vector output_summary_host_; - void *copy_ioaddr_dev_; + void *copy_ioaddr_dev_ = nullptr; - void *copy_input_release_flag_dev_; - void *copy_input_data_size_dev_; - void *copy_input_src_dev_; - void *copy_input_dst_dev_; + void *copy_input_release_flag_dev_ = nullptr; + void *copy_input_data_size_dev_ = nullptr; + void *copy_input_src_dev_ = nullptr; + void *copy_input_dst_dev_ = nullptr; vector out_shape_hbm_; uint64_t kernel_id_ = 0; @@ -222,13 +200,12 @@ class AiCpuCCTask : public AiCpuBaseTask { AiCpuCCTask &operator=(const AiCpuCCTask &) = delete; Status LaunchKernel(rtStream_t stream) override; - OpTaskType GetOpTaskType() override { return OP_TASK_AICPUCC; } - const void *GetIOAddr() const override; + void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; const void *GetArgs() const; void SetKernelArgs(std::unique_ptr args, size_t arg_size); void SetSoName(const std::string &so_name); void SetkernelName(const std::string &kernel_Name); - void SetIoAddr(void *io_addr); + void SetIoAddr(uintptr_t *io_addr); size_t GetArgSize() const; Status LaunchKernel(const std::vector &input_desc, @@ -244,7 +221,8 @@ private: std::unique_ptr args_; size_t arg_size_ = 0; void *sm_desc_ = nullptr; - void *io_addr_ = nullptr; + uintptr_t *io_addr_ = nullptr; + size_t io_addr_num_ = 0; bool is_custom_ = false; uint32_t dump_flag_ = RT_KERNEL_DEFAULT; };