@@ -25,6 +25,7 @@ | |||
#include "graph/load/new_model_manager/model_utils.h" | |||
#include "runtime/mem.h" | |||
#include "single_op/single_op_manager.h" | |||
#include "single_op/task/build_task_utils.h" | |||
#include "graph/load/new_model_manager/model_manager.h" | |||
namespace ge { | |||
@@ -77,7 +78,8 @@ Status ProfilingTaskInfo(OpTask *op_task) { | |||
} | |||
} // namespace | |||
SingleOp::SingleOp(std::mutex *stream_mutex, rtStream_t stream) : stream_mutex_(stream_mutex), stream_(stream) { | |||
SingleOp::SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream) | |||
: stream_resource_(stream_resource), stream_mutex_(stream_mutex), stream_(stream) { | |||
} | |||
FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() { | |||
@@ -159,37 +161,6 @@ Status SingleOp::UpdateArgs(const std::vector<DataBuffer> &inputs, const std::ve | |||
*arg_addr = args_[i]; | |||
} | |||
} | |||
// update aicpu_TF or aicpu_CC args | |||
for (auto &task : tasks_) { | |||
size_t io_addr_num = args_.size(); | |||
if (task->GetOpTaskType() == OP_TASK_AICPU) { | |||
GELOGD("Update aicpu_TF task args"); | |||
task->SetIoAddrsForDump(args_); | |||
auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetIOAddr())); | |||
GE_CHECK_NOTNULL(dst_io_addr); | |||
auto rt_ret = rtMemcpyAsync(dst_io_addr, | |||
sizeof(uint64_t) * args_.size(), | |||
&args_[0], | |||
sizeof(uint64_t) * args_.size(), | |||
RT_MEMCPY_HOST_TO_DEVICE_EX, | |||
stream_); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
GELOGE(rt_ret, "rtMemcpyAsync addresses failed, ret = %d", rt_ret); | |||
return rt_ret; | |||
} | |||
} else if (task->GetOpTaskType() == OP_TASK_AICPUCC) { | |||
GELOGD("Update aicpu_CC task args"); | |||
const uintptr_t *task_io_addr = reinterpret_cast<const uintptr_t *>(task->GetIOAddr()); | |||
GE_CHECK_NOTNULL(task_io_addr); | |||
auto io_addr = reinterpret_cast<uint64_t *>(const_cast<uintptr_t *>(task_io_addr)); | |||
for (size_t i = 0; i < io_addr_num; ++i) { | |||
io_addr[i] = static_cast<uintptr_t>(args_[i]); | |||
} | |||
} else { | |||
GELOGW("Only TF_kernel aicpu and aicpu_CC are supported, but got %u", task->GetOpTaskType()); | |||
continue; | |||
} | |||
} | |||
return SUCCESS; | |||
} | |||
@@ -200,7 +171,19 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c | |||
return ret; | |||
} | |||
GE_CHECK_NOTNULL(stream_resource_); | |||
std::lock_guard<std::mutex> lk(*stream_mutex_); | |||
auto current_mem_base = stream_resource_->GetMemoryBase(); | |||
if (running_param_->mem_base != current_mem_base) { | |||
running_param_->mem_base = const_cast<uint8_t *>(current_mem_base); | |||
GELOGD("Memory base changed, new memory base = %p", current_mem_base); | |||
for (auto &task : tasks_) { | |||
auto new_address = BuildTaskUtils::GetAddresses(task->GetOpdesc(), *running_param_); | |||
GE_CHK_STATUS_RET(task->UpdateArgTable(*running_param_), | |||
"[%s] Failed to update arg table", | |||
task->GetOpdesc()->GetName().c_str()); | |||
} | |||
} | |||
ret = UpdateArgs(inputs, outputs); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
@@ -225,9 +208,6 @@ DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex | |||
: resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) { | |||
} | |||
// Empty destructor: member cleanup (e.g. the unique_ptr-held task) is automatic.
DynamicSingleOp::~DynamicSingleOp() { | |||
} | |||
Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc, | |||
const std::vector<DataBuffer> &inputs, | |||
std::vector<GeTensorDesc> &output_desc, | |||
@@ -249,65 +229,24 @@ Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc, | |||
} | |||
if (input_desc.size() != num_inputs_) { | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input number mismatches. expect %zu, but given %zu", | |||
num_inputs_, input_desc.size()); | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, | |||
"Input number mismatches. expect %zu, but given %zu", | |||
num_inputs_, | |||
input_desc.size()); | |||
return ACL_ERROR_GE_PARAM_INVALID; | |||
} | |||
if (output_desc.size() != num_outputs_) { | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Output number mismatches. expect %zu, but given %zu", | |||
num_outputs_, output_desc.size()); | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, | |||
"Output number mismatches. expect %zu, but given %zu", | |||
num_outputs_, | |||
output_desc.size()); | |||
return ACL_ERROR_GE_PARAM_INVALID; | |||
} | |||
return SUCCESS; | |||
} | |||
// Allocates one contiguous device buffer sized to the sum of all requested
// workspace sizes, then returns per-workspace pointers at the accumulated
// offsets. Alignment/padding is expected to be done upstream (OpParaCalculate).
// @param workspace_sizes  byte size of each workspace the kernel needs
// @param workspaces       output: one pointer per entry in workspace_sizes
// @return SUCCESS; ACL_ERROR_GE_MEMORY_ALLOCATION on allocation failure;
//         an overflow-check status if the total size would overflow int64.
Status DynamicSingleOp::AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes, | |||
std::vector<void *> &workspaces) { | |||
static const std::string kPurpose("malloc workspace memory for dynamic op."); | |||
if (workspace_sizes.empty()) { | |||
// Nothing requested: leave `workspaces` empty and succeed.
GELOGD("No need to allocate workspace."); | |||
return SUCCESS; | |||
} | |||
int64_t total_size = 0; | |||
std::vector<int64_t> ws_offsets; | |||
for (auto ws_size : workspace_sizes) { | |||
// alignment and padding should be done in OpParaCalculate | |||
// Guard against signed overflow before accumulating the running total.
GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size)); | |||
ws_offsets.emplace_back(total_size); | |||
total_size += ws_size; | |||
} | |||
GELOGD("Total workspace size is %ld", total_size); | |||
// Memory comes from the per-stream resource pool looked up by resource_id_.
StreamResource *stream_resource = SingleOpManager::GetInstance().GetResource(resource_id_, stream_); | |||
GE_CHECK_NOTNULL(stream_resource); | |||
auto ws_base = stream_resource->MallocMemory(kPurpose, static_cast<size_t>(total_size)); | |||
if (ws_base == nullptr) { | |||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size); | |||
return ACL_ERROR_GE_MEMORY_ALLOCATION; | |||
} | |||
GELOGD("Done allocating workspace memory successfully."); | |||
// Slice the single allocation into the individual workspace pointers.
for (auto ws_offset : ws_offsets) { | |||
workspaces.emplace_back(ws_base + ws_offset); | |||
} | |||
return SUCCESS; | |||
} | |||
// Runs a TBE task end-to-end for one invocation: refresh tiling/run info from
// the current tensor descs, allocate the workspaces that UpdateRunInfo just
// computed, then launch the kernel on this op's stream.
// Note: input/output buffers are raw device pointers here; validation is
// assumed to have happened in the caller (ExecuteAsync -> ValidateParams).
Status DynamicSingleOp::ExecuteTbeTask(const vector<GeTensorDesc> &input_desc, | |||
const vector<void *> &inputs, | |||
vector<GeTensorDesc> &output_desc, | |||
vector<void *> &outputs) { | |||
GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc)); | |||
std::vector<void *> workspace_buffers; | |||
GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers)); | |||
return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_); | |||
} | |||
Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, | |||
const vector<DataBuffer> &input_buffers, | |||
vector<GeTensorDesc> &output_desc, | |||
@@ -316,32 +255,8 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc, | |||
GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers)); | |||
std::lock_guard<std::mutex> lk(*stream_mutex_); | |||
std::vector<void *> inputs; | |||
std::vector<void *> outputs; | |||
for (auto &buffer : input_buffers) { | |||
inputs.emplace_back(buffer.data); | |||
} | |||
for (auto &buffer : output_buffers) { | |||
outputs.emplace_back(buffer.data); | |||
} | |||
if (op_task_->GetOpTaskType() == OP_TASK_TBE) { | |||
auto ret = ExecuteTbeTask(input_desc, inputs, output_desc, outputs); | |||
if (ret == SUCCESS) { | |||
GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get())); | |||
} | |||
return ret; | |||
} else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) { | |||
auto aicpu_ret = op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_); | |||
if (aicpu_ret == SUCCESS) { | |||
GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get())); | |||
} | |||
return aicpu_ret; | |||
} else { | |||
GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID, | |||
"Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u", | |||
op_task_->GetOpTaskType()); | |||
return ACL_ERROR_GE_OP_TASK_TYPE_INVALID; | |||
} | |||
GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_)); | |||
GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get())); | |||
return SUCCESS; | |||
} | |||
} // namespace ge |
@@ -30,9 +30,11 @@ | |||
#include "cce/aicpu_engine_struct.h" | |||
namespace ge { | |||
class StreamResource; | |||
struct SingleOpModelParam; | |||
class SingleOp { | |||
public: | |||
SingleOp(std::mutex *stream_mutex, rtStream_t stream); | |||
SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream); | |||
~SingleOp(); | |||
Status ExecuteAsync(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs); | |||
@@ -44,6 +46,7 @@ class SingleOp { | |||
Status GetArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs); | |||
friend class SingleOpModel; | |||
StreamResource *stream_resource_; | |||
std::mutex *stream_mutex_; | |||
rtStream_t stream_ = nullptr; | |||
std::vector<void *> input_addr_list_; | |||
@@ -54,12 +57,13 @@ class SingleOp { | |||
std::vector<OpTask *> tasks_; | |||
std::vector<std::vector<uintptr_t *>> arg_table_; | |||
std::unique_ptr<SingleOpModelParam> running_param_; | |||
}; | |||
class DynamicSingleOp { | |||
public: | |||
DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream); | |||
~DynamicSingleOp(); | |||
~DynamicSingleOp() = default; | |||
Status ExecuteAsync(const vector<GeTensorDesc> &input_desc, | |||
const std::vector<DataBuffer> &inputs, | |||
std::vector<GeTensorDesc> &output_desc, | |||
@@ -72,14 +76,6 @@ class DynamicSingleOp { | |||
std::vector<GeTensorDesc> &output_desc, | |||
std::vector<DataBuffer> &outputs) const; | |||
Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes, | |||
std::vector<void *> &workspaces); | |||
Status ExecuteTbeTask(const vector<GeTensorDesc> &input_desc, | |||
const vector<void *> &inputs, | |||
vector<GeTensorDesc> &output_desc, | |||
vector<void *> &outputs); | |||
std::unique_ptr<OpTask> op_task_; | |||
uintptr_t resource_id_ = 0; | |||
std::mutex *stream_mutex_; | |||
@@ -92,7 +92,8 @@ Status SingleOpModel::InitModelMem(StreamResource &res) { | |||
if (model_params_.memory_size > model_params_.zero_copy_mem_size) { | |||
const string purpose("malloc feature map memory on model execute."); | |||
GELOGI("total memory: %lu, zero_copy_mem: %lu", model_params_.memory_size, model_params_.zero_copy_mem_size); | |||
model_params_.mem_base = res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size); | |||
model_params_.mem_base = | |||
res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size, false); | |||
if (model_params_.mem_base == nullptr) { | |||
return ACL_ERROR_GE_MEMORY_ALLOCATION; | |||
} | |||
@@ -226,9 +227,10 @@ Status SingleOpModel::SetInputsAndOutputs(SingleOp &single_op) { | |||
return SUCCESS; | |||
} | |||
Status SingleOpModel::BuildTaskList(SingleOp &single_op) { | |||
Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &single_op) { | |||
auto ge_model = model_helper_.GetGeModel(); | |||
GE_CHECK_NOTNULL(ge_model); | |||
single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size()); | |||
auto tasks = ge_model->GetModelTaskDefPtr()->task(); | |||
for (int i = 0; i < tasks.size(); ++i) { | |||
const TaskDef &task_def = tasks[i]; | |||
@@ -247,9 +249,11 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { | |||
return ret; | |||
} | |||
single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size()); | |||
ParseArgTable(tbe_task, single_op); | |||
tbe_task->SetModelArgs(model_name_, model_id_); | |||
if (tbe_task->tiling_buffer_ != nullptr) { | |||
tbe_task->stream_resource_ = stream_resource; | |||
} | |||
single_op.tasks_.emplace_back(tbe_task); | |||
} else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) { | |||
GELOGD("Building AICPU_CC task"); | |||
@@ -261,6 +265,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { | |||
return ret; | |||
} | |||
task->SetModelArgs(model_name_, model_id_); | |||
ParseArgTable(task, single_op); | |||
single_op.tasks_.emplace_back(task); | |||
} else { | |||
GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID, | |||
@@ -278,6 +283,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { | |||
return ret; | |||
} | |||
aicpu_task->SetModelArgs(model_name_, model_id_); | |||
ParseArgTable(aicpu_task, single_op); | |||
single_op.tasks_.emplace_back(aicpu_task); | |||
} else { | |||
// skip | |||
@@ -287,21 +293,23 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) { | |||
return SUCCESS; | |||
} | |||
void SingleOpModel::ParseArgTable(TbeOpTask *task, SingleOp &op) { | |||
void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) { | |||
if (task == nullptr) { | |||
GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "tbe op task is nullptr"); | |||
return; | |||
} | |||
// args: addr1, addr2, addr3 ... | |||
auto *args = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetArgs())); | |||
size_t arg_size = task->GetArgSize(); | |||
for (size_t i = 0; i < arg_size / sizeof(void *); ++i) { | |||
uintptr_t *ptr_to_addr = args + i; | |||
uintptr_t *arg_base = nullptr; | |||
size_t arg_num = 0; | |||
task->GetIoAddr(arg_base, arg_num); | |||
for (size_t i = 0; i < arg_num; ++i) { | |||
uintptr_t *ptr_to_addr = arg_base + i; | |||
uintptr_t addr = *ptr_to_addr; | |||
auto iter = model_params_.addr_mapping_.find(addr); | |||
if (iter != model_params_.addr_mapping_.end()) { | |||
int arg_index = iter->second; | |||
GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetStubName().c_str(), i, arg_index); | |||
GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetOpdesc()->GetName().c_str(), i, arg_index); | |||
op.arg_table_[iter->second].emplace_back(ptr_to_addr); | |||
} | |||
} | |||
@@ -386,8 +394,10 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa | |||
Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) { | |||
GE_CHK_STATUS_RET_NOLOG(ParseInputsAndOutputs()); | |||
GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource)); | |||
single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params_)); | |||
GE_CHECK_NOTNULL(single_op.running_param_); | |||
GE_CHK_STATUS_RET_NOLOG(SetInputsAndOutputs(single_op)); | |||
return BuildTaskList(single_op); | |||
return BuildTaskList(&resource, single_op); | |||
} | |||
Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) { | |||
@@ -65,7 +65,7 @@ class SingleOpModel { | |||
Status ParseInputNode(const OpDescPtr &op_desc); | |||
void ParseOutputNode(const OpDescPtr &op_desc); | |||
Status BuildTaskList(SingleOp &single_op); | |||
Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op); | |||
Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op); | |||
Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task); | |||
Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task, | |||
@@ -74,7 +74,7 @@ class SingleOpModel { | |||
Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op); | |||
static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam ¶m); | |||
void ParseArgTable(TbeOpTask *task, SingleOp &op); | |||
void ParseArgTable(OpTask *task, SingleOp &op); | |||
std::string model_name_; | |||
uint32_t model_id_ = 0; | |||
@@ -69,11 +69,25 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose, | |||
size_t size, | |||
size_t &max_allocated, | |||
std::vector<uint8_t *> &allocated) { | |||
if (size == 0) { | |||
GELOGD("Mem size == 0"); | |||
return nullptr; | |||
} | |||
if (size <= max_allocated && !allocated.empty()) { | |||
GELOGD("reuse last memory"); | |||
return allocated.back(); | |||
} | |||
if (!allocated.empty()) { | |||
uint8_t *current_buffer = allocated.back(); | |||
allocated.pop_back(); | |||
if (rtStreamSynchronize(stream_) != RT_ERROR_NONE) { | |||
GELOGW("Failed to invoke rtStreamSynchronize"); | |||
} | |||
(void) rtFree(current_buffer); | |||
} | |||
uint8_t *buffer = nullptr; | |||
auto ret = rtMalloc(reinterpret_cast<void **>(&buffer), size, RT_MEMORY_HBM); | |||
if (ret != RT_ERROR_NONE) { | |||
@@ -96,10 +110,14 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose, | |||
return buffer; | |||
} | |||
uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size) { | |||
uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size, bool holding_lock) { | |||
GELOGD("To Malloc memory, size = %zu", size); | |||
uint8_t *buffer = DoMallocMemory(purpose, size, max_memory_size_, memory_list_); | |||
return buffer; | |||
if (holding_lock) { | |||
return DoMallocMemory(purpose, size, max_memory_size_, memory_list_); | |||
} else { | |||
std::lock_guard<std::mutex> lk(stream_mu_); | |||
return DoMallocMemory(purpose, size, max_memory_size_, memory_list_); | |||
} | |||
} | |||
uint8_t *StreamResource::MallocWeight(const std::string &purpose, size_t size) { | |||
@@ -158,7 +176,7 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData & | |||
return ret; | |||
} | |||
auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(&stream_mu_, stream_)); | |||
auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(this, &stream_mu_, stream_)); | |||
if (new_op == nullptr) { | |||
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "new SingleOp failed"); | |||
return ACL_ERROR_GE_MEMORY_ALLOCATION; | |||
@@ -171,4 +189,12 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData & | |||
op_map_[model_data.model_data] = std::move(new_op); | |||
return SUCCESS; | |||
} | |||
// Returns the most recently allocated buffer from this stream's memory list,
// or nullptr if nothing has been allocated yet. Non-owning view: callers must
// not free it. Used to detect whether the memory base changed between runs.
const uint8_t *StreamResource::GetMemoryBase() const { | |||
if (memory_list_.empty()) { | |||
return nullptr; | |||
} | |||
return memory_list_.back(); | |||
} | |||
} // namespace ge |
@@ -45,8 +45,9 @@ class StreamResource { | |||
Status BuildOperator(const std::string &model_name, const ModelData &model_data, SingleOp **single_op); | |||
Status BuildDynamicOperator(const std::string &model_name, const ModelData &model_data, DynamicSingleOp **single_op); | |||
uint8_t *MallocMemory(const std::string &purpose, size_t size); | |||
uint8_t *MallocMemory(const std::string &purpose, size_t size, bool holding_lock = true); | |||
uint8_t *MallocWeight(const std::string &purpose, size_t size); | |||
const uint8_t *GetMemoryBase() const; | |||
private: | |||
uint8_t *DoMallocMemory(const std::string &purpose, | |||
@@ -17,17 +17,22 @@ | |||
#include "single_op/task/aicpu_kernel_task_builder.h" | |||
#include "framework/common/taskdown_common.h" | |||
#include "graph/load/new_model_manager/model_manager.h" | |||
#include "build_task_utils.h" | |||
namespace ge { | |||
// Captures the op description and the kernel definition this builder will use
// to construct an AICPU_CC task; both are held by reference/shared ptr only.
AiCpuCCTaskBuilder::AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def) | |||
: op_desc_(op_desc), kernel_def_(kernel_def) {} | |||
Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) { | |||
Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam ¶m) { | |||
size_t aicpu_arg_size = kernel_def_.args_size(); | |||
if (aicpu_arg_size <= 0) { | |||
if (aicpu_arg_size <= sizeof(aicpu::AicpuParamHead)) { | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "aicpu_arg_size is invalid, value = %zu", aicpu_arg_size); | |||
return ACL_ERROR_GE_PARAM_INVALID; | |||
} | |||
task.io_addr_num_ = op_desc_->GetInputsSize() + op_desc_->GetOutputsSize(); | |||
GE_CHECK_GE(aicpu_arg_size - sizeof(aicpu::AicpuParamHead), task.io_addr_num_ * sizeof(void *)); | |||
std::unique_ptr<uint8_t[]> aicpu_args; | |||
aicpu_args.reset(new(std::nothrow) uint8_t[aicpu_arg_size]()); | |||
if (aicpu_args == nullptr) { | |||
@@ -41,13 +46,19 @@ Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) { | |||
return ACL_ERROR_GE_INTERNAL_ERROR; | |||
} | |||
task.SetIoAddr(aicpu_args.get() + sizeof(aicpu::AicpuParamHead)); | |||
task.SetIoAddr(reinterpret_cast<uintptr_t *>(aicpu_args.get() + sizeof(aicpu::AicpuParamHead))); | |||
task.SetKernelArgs(std::move(aicpu_args), aicpu_arg_size); | |||
auto addresses = BuildTaskUtils::GetKernelArgs(op_desc_, param); | |||
GE_CHECK_GE(addresses.size(), task.io_addr_num_); | |||
for (size_t i = 0; i < task.io_addr_num_; ++i) { | |||
task.io_addr_[i] = reinterpret_cast<uintptr_t>(addresses[i]); | |||
} | |||
return SUCCESS; | |||
} | |||
Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) { | |||
auto ret = SetKernelArgs(task); | |||
Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam ¶m) { | |||
auto ret = SetKernelArgs(task, param); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
} | |||
@@ -30,10 +30,10 @@ class AiCpuCCTaskBuilder { | |||
explicit AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def); | |||
~AiCpuCCTaskBuilder() = default; | |||
Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id); | |||
Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam ¶m); | |||
private: | |||
Status SetKernelArgs(AiCpuCCTask &task); | |||
Status SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam ¶m); | |||
const OpDescPtr op_desc_; | |||
const domi::KernelDef &kernel_def_; | |||
}; | |||
@@ -26,26 +26,6 @@ namespace ge { | |||
AiCpuTaskBuilder::AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def) | |||
: op_desc_(op_desc), kernel_def_(kernel_def) {} | |||
// Copies the host-side list of I/O addresses into a freshly rtMalloc'ed device
// buffer of kernel_def_.args_size() bytes.
// @param io_addr    output: device pointer holding the copied address table;
//                   only valid when SUCCESS is returned
// @param addresses  host addresses to upload (sizeof(void*) each)
// @return SUCCESS, or the runtime error code from rtMalloc/rtMemcpy
Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses) { | |||
size_t arg_size = kernel_def_.args_size(); | |||
auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
GELOGE(rt_ret, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret); | |||
return rt_ret; | |||
} | |||
const void *src_addr = reinterpret_cast<const void *>(addresses.data()); | |||
uint64_t src_len = sizeof(void *) * addresses.size(); | |||
rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE); | |||
if (rt_ret != RT_ERROR_NONE) { | |||
// Release the device buffer so *io_addr does not leak on the error path.
(void)rtFree(*io_addr); | |||
GELOGE(rt_ret, "rtMemcpy addresses failed, ret = %d", rt_ret); | |||
return rt_ret; | |||
} | |||
return SUCCESS; | |||
} | |||
Status AiCpuTaskBuilder::SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &fwk_op_kernel) { | |||
auto sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL), | |||
kernel_def_.args().data(), kernel_def_.args().size()); | |||
@@ -80,39 +60,27 @@ namespace ge { | |||
return SUCCESS; | |||
} | |||
Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, | |||
const SingleOpModelParam ¶m, bool dynamic_flag) { | |||
Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag) { | |||
if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) { | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d", | |||
sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size()); | |||
return ACL_ERROR_GE_PARAM_INVALID; | |||
} | |||
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param); | |||
auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace); | |||
if (dynamic_flag) { | |||
GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM)); | |||
} else { | |||
if (ws_addr_vec.empty()) { | |||
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "workspace Data Address is empty."); | |||
return ACL_ERROR_GE_PARAM_INVALID; | |||
} | |||
*kernel_workspace = ws_addr_vec[0]; | |||
} | |||
GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(), | |||
GE_CHK_RT_RET(rtMalloc(&task.workspace_addr_, kernel_def_.task_info_size(), RT_MEMORY_HBM)); | |||
GE_CHK_RT_RET(rtMemcpy(task.workspace_addr_, kernel_def_.task_info_size(), | |||
kernel_def_.task_info().data(), kernel_def_.task_info_size(), | |||
RT_MEMCPY_HOST_TO_DEVICE)); | |||
auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses)); | |||
if (ret != SUCCESS) { | |||
return ret; | |||
} | |||
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false); | |||
task.io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses); | |||
task.io_addr_size_ = task.io_addr_host_.size() * sizeof(void *); | |||
GE_CHK_RT_RET(rtMalloc(&task.io_addr_, task.io_addr_size_, RT_MEMORY_HBM)); | |||
return SUCCESS; | |||
} | |||
Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam ¶m, | |||
bool dynamic_flag, uint64_t kernel_id) { | |||
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&task.io_addr_, &task.workspace_addr_, param, dynamic_flag)); | |||
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(task, param, dynamic_flag)); | |||
STR_FWK_OP_KERNEL fwk_op_kernel = {0}; | |||
auto ret = SetFmkOpKernel(task.io_addr_, task.workspace_addr_, fwk_op_kernel); | |||
@@ -33,10 +33,8 @@ namespace ge { | |||
private: | |||
static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel); | |||
Status SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses); | |||
Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel); | |||
Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace, | |||
const SingleOpModelParam ¶m, bool dynamic_flag); | |||
Status InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam ¶m, bool dynamic_flag); | |||
const OpDescPtr op_desc_; | |||
const domi::KernelExDef &kernel_def_; | |||
@@ -32,7 +32,8 @@ const uint64_t kVarSize = 0; | |||
} | |||
std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &op_desc, | |||
const SingleOpModelParam ¶m) { | |||
const SingleOpModelParam ¶m, | |||
bool keep_workspace) { | |||
std::vector<std::vector<void *>> ret; | |||
RuntimeParam runtime_para; | |||
runtime_para.mem_size = param.memory_size; | |||
@@ -49,7 +50,9 @@ std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &o | |||
ret.emplace_back(ModelUtils::GetInputDataAddrs(runtime_para, op_desc)); | |||
ret.emplace_back(ModelUtils::GetOutputDataAddrs(runtime_para, op_desc)); | |||
ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc)); | |||
if (keep_workspace) { | |||
ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc)); | |||
} | |||
return ret; | |||
} | |||
@@ -27,15 +27,17 @@ | |||
namespace ge { | |||
class BuildTaskUtils { | |||
public: | |||
static constexpr int kAddressIndexOutput = 1; | |||
static constexpr int kAddressIndexWorkspace = 2; | |||
static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc, const SingleOpModelParam ¶m); | |||
static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc, | |||
const SingleOpModelParam ¶m, | |||
bool keep_workspace = true); | |||
static std::vector<void *> JoinAddresses(const std::vector<std::vector<void *>> &addresses); | |||
static std::vector<void *> GetKernelArgs(const OpDescPtr &op_desc, const SingleOpModelParam ¶m); | |||
static std::string GetTaskInfo(const OpDescPtr &op_desc); | |||
template<typename T> | |||
static std::string VectorToString(const std::vector<T> &values) | |||
{ | |||
static std::string VectorToString(const std::vector<T> &values) { | |||
std::stringstream ss; | |||
ss << '['; | |||
auto size = values.size(); | |||
@@ -24,9 +24,11 @@ | |||
#include "common/dump/dump_manager.h" | |||
#include "common/dump/dump_op.h" | |||
#include "common/formats/formats.h" | |||
#include "common/math/math_util.h" | |||
#include "framework/common/debug/log.h" | |||
#include "register/op_tiling.h" | |||
#include "runtime/rt.h" | |||
#include "build_task_utils.h" | |||
namespace ge { | |||
namespace { | |||
@@ -48,18 +50,22 @@ Status OpTask::OpenDump(rtStream_t stream) { | |||
std::vector<uint64_t> output_adds; | |||
auto input_size = op_desc_->GetInputsSize(); | |||
auto output_size = op_desc_->GetOutputsSize(); | |||
auto all_size = io_addrs_for_dump_.size(); | |||
if (input_size + output_size != all_size) { | |||
GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", all_size, | |||
uintptr_t *arg_base = nullptr; | |||
size_t arg_num = 0; | |||
GetIoAddr(arg_base, arg_num); | |||
if (arg_num < input_size + output_size) { | |||
GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", | |||
arg_num, | |||
input_size + output_size); | |||
return FAILED; | |||
} | |||
for (size_t i = 0; i < input_size; i++) { | |||
uint64_t input_addr = io_addrs_for_dump_[i]; | |||
uint64_t input_addr = arg_base[i]; | |||
input_addrs.emplace_back(input_addr); | |||
} | |||
for (size_t j = 0; j < output_size; j++) { | |||
uint64_t output_addr = io_addrs_for_dump_[input_size + j]; | |||
uint64_t output_addr = arg_base[input_size + j]; | |||
output_adds.emplace_back(output_addr); | |||
} | |||
dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream); | |||
@@ -89,10 +95,6 @@ void TbeOpTask::SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size | |||
void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; } | |||
const vector<int64_t> &OpTask::GetWorkspaceSizes() const { return workspace_sizes_; } | |||
void OpTask::SetWorkspaceSizes(const vector<int64_t> &workspace_sizes) { workspace_sizes_ = workspace_sizes; } | |||
void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) { | |||
model_name_ = model_name; | |||
model_id_ = model_id; | |||
@@ -107,6 +109,36 @@ Status OpTask::GetProfilingArgs(std::string &model_name, std::string &op_name, u | |||
op_name = op_desc_->GetName(); | |||
return SUCCESS; | |||
} | |||
// Base-class stub: recomputing run info (tiling/block dim) only makes sense
// for task types that override this (e.g. TbeOpTask); others are UNSUPPORTED.
Status OpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) { | |||
return UNSUPPORTED; | |||
} | |||
// Rewrites this task's I/O address table from the model parameters, e.g. after
// the stream's memory base has moved. Flattens input/output/workspace
// addresses and writes them over the task's arg base in order.
// @param param  model parameters used to resolve the current addresses
// @return SUCCESS, or INTERNAL_ERROR if the resolved address count does not
//         match the task's arg slot count (nothing is written in that case).
Status OpTask::UpdateArgTable(const SingleOpModelParam &param) { | |||
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param); | |||
auto all_addresses = BuildTaskUtils::JoinAddresses(addresses); | |||
uintptr_t *arg_base = nullptr; | |||
size_t arg_num = 0; | |||
GetIoAddr(arg_base, arg_num); | |||
if (arg_num != all_addresses.size()) { | |||
GELOGE(INTERNAL_ERROR, "[%s] arg number mismatches, expect = %zu, but got = %zu", | |||
op_desc_->GetName().c_str(), | |||
arg_num, | |||
all_addresses.size()); | |||
return INTERNAL_ERROR; | |||
} | |||
// Overwrite the arg table in place; arg_base points into task-owned storage.
for (void *addr : all_addresses) { | |||
*arg_base++ = reinterpret_cast<uintptr_t >(addr); | |||
} | |||
return SUCCESS; | |||
} | |||
// Base-class stub for the dynamic-shape launch path; concrete task types
// (TBE / AICPU variants) override it. Returns UNSUPPORTED unconditionally.
Status OpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc, | |||
const vector<DataBuffer> &input_buffers, | |||
vector<GeTensorDesc> &output_desc, | |||
vector<DataBuffer> &output_buffers, | |||
rtStream_t stream) { | |||
return UNSUPPORTED; | |||
} | |||
TbeOpTask::~TbeOpTask() { | |||
if (sm_desc_ != nullptr) { | |||
@@ -141,12 +173,6 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) { | |||
return RT_FAILED; | |||
} | |||
GELOGI("[TASK_INFO] %s", this->stub_name_.c_str()); | |||
size_t input_size = op_desc_->GetInputsSize(); | |||
size_t output_size = op_desc_->GetOutputsSize(); | |||
uint64_t *io_addr = reinterpret_cast<uint64_t *>(args_.get()); | |||
std::vector<uint64_t> io_addrs(io_addr, io_addr + input_size + output_size); | |||
SetIoAddrsForDump(io_addrs); | |||
auto status = OpenDump(stream); | |||
if (status != SUCCESS) { | |||
GELOGE(status, "Open dump failed in the tbe single op %s", this->stub_name_.c_str()); | |||
@@ -167,11 +193,12 @@ Status TbeOpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const ve | |||
GELOGE(FAILED, "Failed to invoke OpParaCalculate. ret = %u", ret); | |||
return FAILED; | |||
} | |||
SetWorkspaceSizes(run_info.workspaces); | |||
block_dim_ = run_info.block_dim; | |||
tiling_data_ = run_info.tiling_data.str(); | |||
GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu", block_dim_, | |||
tiling_data_.size()); | |||
GE_CHK_STATUS_RET(AllocateWorkspaces(run_info.workspaces), "Failed to allocate workspaces"); | |||
return SUCCESS; | |||
} | |||
@@ -227,13 +254,54 @@ void TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, s | |||
max_tiling_size_ = max_tiling_size; | |||
} | |||
// Allocates one contiguous device buffer from the owning StreamResource and
// carves it into per-workspace addresses recorded in workspaces_.
// @param workspace_sizes  size in bytes of each workspace required by the op
// @return SUCCESS on success; ACL_ERROR_GE_MEMORY_ALLOCATION on failure
Status TbeOpTask::AllocateWorkspaces(const vector<int64_t> &workspace_sizes) {
  // Purpose string passed to the allocator for memory tracing/diagnostics.
  static const std::string kPurpose("malloc workspace memory for dynamic op.");
  if (workspace_sizes.empty()) {
    GELOGD("No need to allocate workspace.");
    return SUCCESS;
  }
  // Compute the total size and per-workspace offsets in one pass.
  // Alignment and padding should be done in OpParaCalculate.
  int64_t total_size = 0;
  std::vector<int64_t> ws_offsets;
  for (auto ws_size : workspace_sizes) {
    GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size));
    ws_offsets.emplace_back(total_size);
    total_size += ws_size;
  }
  GELOGD("Total workspace size is %ld", total_size);
  GE_CHECK_NOTNULL(stream_resource_);
  // Single allocation for all workspaces; individual addresses are derived
  // from the base pointer plus the offsets computed above.
  auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast<size_t>(total_size));
  if (ws_base == nullptr) {
    GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size);
    return ACL_ERROR_GE_MEMORY_ALLOCATION;
  }
  GELOGD("Done allocating workspace memory successfully.");
  for (auto ws_offset : ws_offsets) {
    workspaces_.emplace_back(ws_base + ws_offset);
  }
  return SUCCESS;
}
Status TbeOpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc, | |||
const vector<DataBuffer> &input_buffers, | |||
vector<GeTensorDesc> &output_desc, | |||
vector<DataBuffer> &output_buffers, | |||
rtStream_t stream) { | |||
GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo(input_desc, output_desc)); | |||
GELOGD("[%s] Start to launch kernel", node_->GetName().c_str()); | |||
std::vector<void *> args; | |||
args.insert(args.end(), inputs.begin(), inputs.end()); | |||
args.insert(args.end(), outputs.begin(), outputs.end()); | |||
args.insert(args.end(), workspaces.begin(), workspaces.end()); | |||
for (auto &buffer : input_buffers) { | |||
args.emplace_back(buffer.data); | |||
} | |||
for (auto &buffer : output_buffers) { | |||
args.emplace_back(buffer.data); | |||
} | |||
for (auto &buffer : workspaces_) { | |||
args.emplace_back(buffer); | |||
} | |||
if (tiling_buffer_ != nullptr) { | |||
GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size()); | |||
@@ -254,6 +322,14 @@ Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void * | |||
return SUCCESS; | |||
} | |||
void TbeOpTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) { | |||
arg_base = reinterpret_cast<uintptr_t *>(args_.get()); | |||
arg_count = arg_size_ / sizeof(void *); | |||
if (tiling_buffer_ != nullptr) { | |||
--arg_count; | |||
} | |||
} | |||
AiCpuBaseTask::~AiCpuBaseTask() { | |||
if (ext_info_addr_dev_ != nullptr) { | |||
(void)rtFree(ext_info_addr_dev_); | |||
@@ -399,12 +475,14 @@ AiCpuTask::~AiCpuTask() { | |||
} | |||
} | |||
const void *AiCpuTask::GetIOAddr() const { return io_addr_; } | |||
Status AiCpuTask::LaunchKernel(rtStream_t stream) { | |||
GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str()); | |||
auto ret = rtMemcpyAsync(workspace_addr_, task_info_.size(), task_info_.data(), task_info_.size(), | |||
RT_MEMCPY_HOST_TO_DEVICE_EX, stream); | |||
auto ret = rtMemcpyAsync(io_addr_, | |||
io_addr_size_, | |||
io_addr_host_.data(), | |||
io_addr_host_.size() * sizeof(void *), | |||
RT_MEMCPY_HOST_TO_DEVICE_EX, | |||
stream); | |||
if (ret != RT_ERROR_NONE) { | |||
GELOGE(RT_FAILED, "rtMemcpyAsync workspace data failed. ret = %d, task = %s", ret, this->op_type_.c_str()); | |||
return RT_FAILED; | |||
@@ -680,6 +758,17 @@ Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | |||
return SUCCESS; | |||
} | |||
// Rebuilds the host-side IO address table from the single-op model params;
// it is copied to the device-side table in LaunchKernel.
// NOTE(review): the third argument presumably excludes workspace addresses
// from the table — confirm against BuildTaskUtils::GetAddresses.
Status AiCpuTask::UpdateArgTable(const SingleOpModelParam &param) {
  auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false);
  io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses);
  return SUCCESS;
}
// Exposes the host-side IO address table (io_addr_host_); LaunchKernel
// copies it to the device before the kernel runs.
void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = reinterpret_cast<uintptr_t *>(io_addr_host_.data());
  arg_count = io_addr_host_.size();
}
void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) { | |||
args_ = std::move(args); | |||
arg_size_ = arg_size; | |||
@@ -691,9 +780,7 @@ void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; } | |||
void AiCpuCCTask::SetkernelName(const std::string &kernel_Name) { kernel_name_ = kernel_Name; } | |||
void AiCpuCCTask::SetIoAddr(void *io_addr) { io_addr_ = io_addr; } | |||
const void *AiCpuCCTask::GetIOAddr() const { return io_addr_; } | |||
void AiCpuCCTask::SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; } | |||
const void *AiCpuCCTask::GetArgs() const { return args_.get(); } | |||
@@ -716,12 +803,6 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) { | |||
return ret; | |||
} | |||
GELOGD("Invoke rtCpuKernelLaunch succeeded"); | |||
size_t input_size = op_desc_->GetInputsSize(); | |||
size_t output_size = op_desc_->GetOutputsSize(); | |||
uint64_t *io_addr = reinterpret_cast<uint64_t *>(io_addr_); | |||
std::vector<uint64_t> io_addrs (io_addr, io_addr + input_size + output_size); | |||
SetIoAddrsForDump(io_addrs); | |||
auto status = OpenDump(stream); | |||
if (status != SUCCESS) { | |||
GELOGE(status, "Open dump failed in the aicpucc single op %s", this->kernel_name_.c_str()); | |||
@@ -761,4 +842,9 @@ Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | |||
return SUCCESS; | |||
} | |||
// Exposes the host-side IO address table for in-place address updates.
void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
  arg_base = io_addr_;
  arg_count = io_addr_num_;
}
} // namespace ge |
@@ -32,49 +32,27 @@ | |||
#include "init/gelib.h" | |||
namespace ge { | |||
// Category of a single-op task, used to dispatch task-specific handling.
enum OpTaskType {
  OP_TASK_TBE = 0,  // TBE kernel task
  OP_TASK_AICPU,    // aicpu_TF task
  OP_TASK_AICPUCC,  // aicpu_CC task
  OP_TASK_INVALID,  // sentinel: uninitialized/unknown task type
};
class StreamResource; | |||
struct SingleOpModelParam; | |||
class OpTask { | |||
public: | |||
OpTask() = default; | |||
virtual ~OpTask() = default; | |||
virtual Status LaunchKernel(rtStream_t stream) = 0; | |||
virtual Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc, | |||
const vector<GeTensorDesc> &output_desc) { | |||
return UNSUPPORTED; | |||
} | |||
virtual Status LaunchKernel(const std::vector<void *> &inputs, | |||
const std::vector<void *> &outputs, | |||
const std::vector<void *> &workspaces, | |||
rtStream_t stream) { | |||
return UNSUPPORTED; | |||
} | |||
virtual OpTaskType GetOpTaskType() = 0; | |||
virtual const void *GetIOAddr() const = 0; | |||
const vector<int64_t> &GetWorkspaceSizes() const; | |||
void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes); | |||
const vector<GeTensorDesc> &output_desc); | |||
virtual Status UpdateArgTable(const SingleOpModelParam ¶m); | |||
void SetModelArgs(std::string model_name, uint32_t model_id); | |||
Status GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id, uint32_t &block_dim); | |||
const OpDescPtr &GetOpdesc() const {return op_desc_;} | |||
Status OpenDump(rtStream_t stream); | |||
void SetIoAddrsForDump(const vector<uint64_t> &io_addrs_for_dump) { | |||
io_addrs_for_dump_ = io_addrs_for_dump; | |||
} | |||
virtual void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) = 0; | |||
virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | |||
const std::vector<DataBuffer> &input_buffers, | |||
std::vector<GeTensorDesc> &output_desc, | |||
std::vector<DataBuffer> &output_buffers, | |||
rtStream_t stream) { | |||
return UNSUPPORTED; | |||
} | |||
rtStream_t stream); | |||
private: | |||
std::vector<int64_t> workspace_sizes_; | |||
protected: | |||
DumpProperties dump_properties_; | |||
DumpOp dump_op_; | |||
@@ -82,19 +60,18 @@ class OpTask { | |||
std::string model_name_; | |||
uint32_t model_id_ = 0; | |||
uint32_t block_dim_ = 1; | |||
std::vector<uint64_t> io_addrs_for_dump_; | |||
}; | |||
class TbeOpTask : public OpTask { | |||
public: | |||
~TbeOpTask() override; | |||
Status LaunchKernel(rtStream_t stream) override; | |||
OpTaskType GetOpTaskType() override { | |||
return OP_TASK_TBE; | |||
} | |||
const void *GetIOAddr() const override { | |||
return nullptr; | |||
} | |||
Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | |||
const std::vector<DataBuffer> &input_buffers, | |||
std::vector<GeTensorDesc> &output_desc, | |||
std::vector<DataBuffer> &output_buffers, | |||
rtStream_t stream) override; | |||
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; | |||
void SetSmDesc(void *sm_desc); | |||
void SetStubFunc(const std::string &name, const void *stub_func); | |||
void SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc); | |||
@@ -102,20 +79,17 @@ class TbeOpTask : public OpTask { | |||
Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc, | |||
const vector<GeTensorDesc> &output_desc) override; | |||
Status LaunchKernel(const vector<void *> &inputs, | |||
const vector<void *> &outputs, | |||
const vector<void *> &workspaces, | |||
rtStream_t stream) override; | |||
const void *GetArgs() const; | |||
size_t GetArgSize() const; | |||
const std::string &GetStubName() const; | |||
void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size); | |||
private: | |||
friend class SingleOpModel; | |||
static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor); | |||
Status UpdateNodeByShape(const vector<GeTensorDesc> &input_desc, | |||
const vector<GeTensorDesc> &output_desc); | |||
Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes); | |||
const void *stub_func_ = nullptr; | |||
std::unique_ptr<uint8_t[]> args_; | |||
@@ -123,9 +97,11 @@ class TbeOpTask : public OpTask { | |||
void *sm_desc_ = nullptr; | |||
std::string stub_name_; | |||
StreamResource *stream_resource_ = nullptr; | |||
void *tiling_buffer_ = nullptr; | |||
uint32_t max_tiling_size_ = 0; | |||
std::string tiling_data_; | |||
std::vector<void *> workspaces_; | |||
NodePtr node_; | |||
}; | |||
@@ -133,7 +109,7 @@ class AiCpuBaseTask : public OpTask { | |||
public: | |||
AiCpuBaseTask() = default; | |||
~AiCpuBaseTask() override; | |||
const UnknowShapeOpType GetUnknownType() const { return unknown_type_; } | |||
UnknowShapeOpType GetUnknownType() const { return unknown_type_; } | |||
protected: | |||
Status SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id); | |||
@@ -158,10 +134,8 @@ class AiCpuTask : public AiCpuBaseTask { | |||
~AiCpuTask() override; | |||
Status LaunchKernel(rtStream_t stream) override; | |||
OpTaskType GetOpTaskType() override { | |||
return OP_TASK_AICPU; | |||
} | |||
const void *GetIOAddr() const override; | |||
Status UpdateArgTable(const SingleOpModelParam ¶m) override; | |||
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; | |||
Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | |||
const std::vector<DataBuffer> &input_buffers, | |||
@@ -188,27 +162,31 @@ class AiCpuTask : public AiCpuBaseTask { | |||
friend class AiCpuTaskBuilder; | |||
void *workspace_addr_ = nullptr; | |||
std::string task_info_; | |||
// device addr | |||
// device addr | |||
void *args_ = nullptr; | |||
size_t arg_size_ = 0; | |||
std::string op_type_; | |||
// device addr | |||
void *io_addr_ = nullptr; | |||
size_t io_addr_size_ = 0; | |||
// host addr | |||
std::vector<void *> io_addr_host_; | |||
bool dynamic_flag_ = false; | |||
// for copy task | |||
void *copy_task_args_buf_; | |||
void *copy_workspace_buf_; | |||
void *copy_task_args_buf_ = nullptr; | |||
void *copy_workspace_buf_ = nullptr; | |||
std::vector<void *> output_summary_; | |||
std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_; | |||
void *copy_ioaddr_dev_; | |||
void *copy_ioaddr_dev_ = nullptr; | |||
void *copy_input_release_flag_dev_; | |||
void *copy_input_data_size_dev_; | |||
void *copy_input_src_dev_; | |||
void *copy_input_dst_dev_; | |||
void *copy_input_release_flag_dev_ = nullptr; | |||
void *copy_input_data_size_dev_ = nullptr; | |||
void *copy_input_src_dev_ = nullptr; | |||
void *copy_input_dst_dev_ = nullptr; | |||
vector<void *> out_shape_hbm_; | |||
uint64_t kernel_id_ = 0; | |||
@@ -222,13 +200,12 @@ class AiCpuCCTask : public AiCpuBaseTask { | |||
AiCpuCCTask &operator=(const AiCpuCCTask &) = delete; | |||
Status LaunchKernel(rtStream_t stream) override; | |||
OpTaskType GetOpTaskType() override { return OP_TASK_AICPUCC; } | |||
const void *GetIOAddr() const override; | |||
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override; | |||
const void *GetArgs() const; | |||
void SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size); | |||
void SetSoName(const std::string &so_name); | |||
void SetkernelName(const std::string &kernel_Name); | |||
void SetIoAddr(void *io_addr); | |||
void SetIoAddr(uintptr_t *io_addr); | |||
size_t GetArgSize() const; | |||
Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc, | |||
@@ -244,7 +221,8 @@ private: | |||
std::unique_ptr<uint8_t[]> args_; | |||
size_t arg_size_ = 0; | |||
void *sm_desc_ = nullptr; | |||
void *io_addr_ = nullptr; | |||
uintptr_t *io_addr_ = nullptr; | |||
size_t io_addr_num_ = 0; | |||
bool is_custom_ = false; | |||
uint32_t dump_flag_ = RT_KERNEL_DEFAULT; | |||
}; | |||