Browse Source

ACL single op refactoring

tags/v1.2.0
chuxing 4 years ago
parent
commit
b373acb0e2
14 changed files with 278 additions and 284 deletions
  1. +26
    -111
      ge/single_op/single_op.cc
  2. +6
    -10
      ge/single_op/single_op.h
  3. +20
    -10
      ge/single_op/single_op_model.cc
  4. +2
    -2
      ge/single_op/single_op_model.h
  5. +30
    -4
      ge/single_op/stream_resource.cc
  6. +2
    -1
      ge/single_op/stream_resource.h
  7. +16
    -5
      ge/single_op/task/aicpu_kernel_task_builder.cc
  8. +2
    -2
      ge/single_op/task/aicpu_kernel_task_builder.h
  9. +8
    -40
      ge/single_op/task/aicpu_task_builder.cc
  10. +1
    -3
      ge/single_op/task/aicpu_task_builder.h
  11. +5
    -2
      ge/single_op/task/build_task_utils.cc
  12. +5
    -3
      ge/single_op/task/build_task_utils.h
  13. +120
    -34
      ge/single_op/task/op_task.cc
  14. +35
    -57
      ge/single_op/task/op_task.h

+ 26
- 111
ge/single_op/single_op.cc View File

@@ -25,6 +25,7 @@
#include "graph/load/new_model_manager/model_utils.h"
#include "runtime/mem.h"
#include "single_op/single_op_manager.h"
#include "single_op/task/build_task_utils.h"
#include "graph/load/new_model_manager/model_manager.h"

namespace ge {
@@ -77,7 +78,8 @@ Status ProfilingTaskInfo(OpTask *op_task) {
}
} // namespace

SingleOp::SingleOp(std::mutex *stream_mutex, rtStream_t stream) : stream_mutex_(stream_mutex), stream_(stream) {
SingleOp::SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream)
: stream_resource_(stream_resource), stream_mutex_(stream_mutex), stream_(stream) {
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY SingleOp::~SingleOp() {
@@ -159,37 +161,6 @@ Status SingleOp::UpdateArgs(const std::vector<DataBuffer> &inputs, const std::ve
*arg_addr = args_[i];
}
}
// update aicpu_TF or aicpu_CC args
for (auto &task : tasks_) {
size_t io_addr_num = args_.size();
if (task->GetOpTaskType() == OP_TASK_AICPU) {
GELOGD("Update aicpu_TF task args");
task->SetIoAddrsForDump(args_);
auto *dst_io_addr = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetIOAddr()));
GE_CHECK_NOTNULL(dst_io_addr);
auto rt_ret = rtMemcpyAsync(dst_io_addr,
sizeof(uint64_t) * args_.size(),
&args_[0],
sizeof(uint64_t) * args_.size(),
RT_MEMCPY_HOST_TO_DEVICE_EX,
stream_);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "rtMemcpyAsync addresses failed, ret = %d", rt_ret);
return rt_ret;
}
} else if (task->GetOpTaskType() == OP_TASK_AICPUCC) {
GELOGD("Update aicpu_CC task args");
const uintptr_t *task_io_addr = reinterpret_cast<const uintptr_t *>(task->GetIOAddr());
GE_CHECK_NOTNULL(task_io_addr);
auto io_addr = reinterpret_cast<uint64_t *>(const_cast<uintptr_t *>(task_io_addr));
for (size_t i = 0; i < io_addr_num; ++i) {
io_addr[i] = static_cast<uintptr_t>(args_[i]);
}
} else {
GELOGW("Only TF_kernel aicpu and aicpu_CC are supported, but got %u", task->GetOpTaskType());
continue;
}
}
return SUCCESS;
}

@@ -200,7 +171,19 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY Status SingleOp::ExecuteAsync(c
return ret;
}

GE_CHECK_NOTNULL(stream_resource_);
std::lock_guard<std::mutex> lk(*stream_mutex_);
auto current_mem_base = stream_resource_->GetMemoryBase();
if (running_param_->mem_base != current_mem_base) {
running_param_->mem_base = const_cast<uint8_t *>(current_mem_base);
GELOGD("Memory base changed, new memory base = %p", current_mem_base);
for (auto &task : tasks_) {
auto new_address = BuildTaskUtils::GetAddresses(task->GetOpdesc(), *running_param_);
GE_CHK_STATUS_RET(task->UpdateArgTable(*running_param_),
"[%s] Failed to update arg table",
task->GetOpdesc()->GetName().c_str());
}
}
ret = UpdateArgs(inputs, outputs);
if (ret != SUCCESS) {
return ret;
@@ -225,9 +208,6 @@ DynamicSingleOp::DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex
: resource_id_(resource_id), stream_mutex_(stream_mutex), stream_(stream) {
}

DynamicSingleOp::~DynamicSingleOp() {
}

Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &inputs,
std::vector<GeTensorDesc> &output_desc,
@@ -249,65 +229,24 @@ Status DynamicSingleOp::ValidateParams(const vector<GeTensorDesc> &input_desc,
}

if (input_desc.size() != num_inputs_) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Input number mismatches. expect %zu, but given %zu",
num_inputs_, input_desc.size());
GELOGE(ACL_ERROR_GE_PARAM_INVALID,
"Input number mismatches. expect %zu, but given %zu",
num_inputs_,
input_desc.size());
return ACL_ERROR_GE_PARAM_INVALID;
}

if (output_desc.size() != num_outputs_) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "Output number mismatches. expect %zu, but given %zu",
num_outputs_, output_desc.size());
GELOGE(ACL_ERROR_GE_PARAM_INVALID,
"Output number mismatches. expect %zu, but given %zu",
num_outputs_,
output_desc.size());
return ACL_ERROR_GE_PARAM_INVALID;
}

return SUCCESS;
}

Status DynamicSingleOp::AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes,
std::vector<void *> &workspaces) {
static const std::string kPurpose("malloc workspace memory for dynamic op.");
if (workspace_sizes.empty()) {
GELOGD("No need to allocate workspace.");
return SUCCESS;
}
int64_t total_size = 0;
std::vector<int64_t> ws_offsets;
for (auto ws_size : workspace_sizes) {
// alignment and padding should be done in OpParaCalculate
GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size));
ws_offsets.emplace_back(total_size);
total_size += ws_size;
}

GELOGD("Total workspace size is %ld", total_size);
StreamResource *stream_resource = SingleOpManager::GetInstance().GetResource(resource_id_, stream_);
GE_CHECK_NOTNULL(stream_resource);
auto ws_base = stream_resource->MallocMemory(kPurpose, static_cast<size_t>(total_size));
if (ws_base == nullptr) {
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size);
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
GELOGD("Done allocating workspace memory successfully.");

for (auto ws_offset : ws_offsets) {
workspaces.emplace_back(ws_base + ws_offset);
}

return SUCCESS;
}

Status DynamicSingleOp::ExecuteTbeTask(const vector<GeTensorDesc> &input_desc,
const vector<void *> &inputs,
vector<GeTensorDesc> &output_desc,
vector<void *> &outputs) {
GE_CHK_STATUS_RET_NOLOG(op_task_->UpdateRunInfo(input_desc, output_desc));

std::vector<void *> workspace_buffers;
GE_CHK_STATUS_RET_NOLOG(AllocateWorkspaces(op_task_->GetWorkspaceSizes(), workspace_buffers));

return op_task_->LaunchKernel(inputs, outputs, workspace_buffers, stream_);
}

Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
const vector<DataBuffer> &input_buffers,
vector<GeTensorDesc> &output_desc,
@@ -316,32 +255,8 @@ Status DynamicSingleOp::ExecuteAsync(const vector<GeTensorDesc> &input_desc,
GE_CHK_STATUS_RET_NOLOG(ValidateParams(input_desc, input_buffers, output_desc, output_buffers));
std::lock_guard<std::mutex> lk(*stream_mutex_);

std::vector<void *> inputs;
std::vector<void *> outputs;
for (auto &buffer : input_buffers) {
inputs.emplace_back(buffer.data);
}
for (auto &buffer : output_buffers) {
outputs.emplace_back(buffer.data);
}

if (op_task_->GetOpTaskType() == OP_TASK_TBE) {
auto ret = ExecuteTbeTask(input_desc, inputs, output_desc, outputs);
if (ret == SUCCESS) {
GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get()));
}
return ret;
} else if (op_task_->GetOpTaskType() == OP_TASK_AICPU || op_task_->GetOpTaskType() == OP_TASK_AICPUCC) {
auto aicpu_ret = op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_);
if (aicpu_ret == SUCCESS) {
GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get()));
}
return aicpu_ret;
} else {
GELOGE(ACL_ERROR_GE_OP_TASK_TYPE_INVALID,
"Only TBE_Task, AI_CPU_Task and AI_CPUCC_Task are supported, but got %u",
op_task_->GetOpTaskType());
return ACL_ERROR_GE_OP_TASK_TYPE_INVALID;
}
GE_CHK_STATUS_RET_NOLOG(op_task_->LaunchKernel(input_desc, input_buffers, output_desc, output_buffers, stream_));
GE_CHK_STATUS_RET_NOLOG(ProfilingTaskInfo(op_task_.get()));
return SUCCESS;
}
} // namespace ge

+ 6
- 10
ge/single_op/single_op.h View File

@@ -30,9 +30,11 @@
#include "cce/aicpu_engine_struct.h"

namespace ge {
class StreamResource;
struct SingleOpModelParam;
class SingleOp {
public:
SingleOp(std::mutex *stream_mutex, rtStream_t stream);
SingleOp(StreamResource *stream_resource, std::mutex *stream_mutex, rtStream_t stream);
~SingleOp();

Status ExecuteAsync(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);
@@ -44,6 +46,7 @@ class SingleOp {
Status GetArgs(const std::vector<DataBuffer> &inputs, const std::vector<DataBuffer> &outputs);

friend class SingleOpModel;
StreamResource *stream_resource_;
std::mutex *stream_mutex_;
rtStream_t stream_ = nullptr;
std::vector<void *> input_addr_list_;
@@ -54,12 +57,13 @@ class SingleOp {

std::vector<OpTask *> tasks_;
std::vector<std::vector<uintptr_t *>> arg_table_;
std::unique_ptr<SingleOpModelParam> running_param_;
};

class DynamicSingleOp {
public:
DynamicSingleOp(uintptr_t resource_id, std::mutex *stream_mutex_, rtStream_t stream);
~DynamicSingleOp();
~DynamicSingleOp() = default;
Status ExecuteAsync(const vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &inputs,
std::vector<GeTensorDesc> &output_desc,
@@ -72,14 +76,6 @@ class DynamicSingleOp {
std::vector<GeTensorDesc> &output_desc,
std::vector<DataBuffer> &outputs) const;

Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes,
std::vector<void *> &workspaces);

Status ExecuteTbeTask(const vector<GeTensorDesc> &input_desc,
const vector<void *> &inputs,
vector<GeTensorDesc> &output_desc,
vector<void *> &outputs);

std::unique_ptr<OpTask> op_task_;
uintptr_t resource_id_ = 0;
std::mutex *stream_mutex_;


+ 20
- 10
ge/single_op/single_op_model.cc View File

@@ -92,7 +92,8 @@ Status SingleOpModel::InitModelMem(StreamResource &res) {
if (model_params_.memory_size > model_params_.zero_copy_mem_size) {
const string purpose("malloc feature map memory on model execute.");
GELOGI("total memory: %lu, zero_copy_mem: %lu", model_params_.memory_size, model_params_.zero_copy_mem_size);
model_params_.mem_base = res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size);
model_params_.mem_base =
res.MallocMemory(purpose, model_params_.memory_size - model_params_.zero_copy_mem_size, false);
if (model_params_.mem_base == nullptr) {
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
@@ -226,9 +227,10 @@ Status SingleOpModel::SetInputsAndOutputs(SingleOp &single_op) {
return SUCCESS;
}

Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
Status SingleOpModel::BuildTaskList(StreamResource *stream_resource, SingleOp &single_op) {
auto ge_model = model_helper_.GetGeModel();
GE_CHECK_NOTNULL(ge_model);
single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size());
auto tasks = ge_model->GetModelTaskDefPtr()->task();
for (int i = 0; i < tasks.size(); ++i) {
const TaskDef &task_def = tasks[i];
@@ -247,9 +249,11 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
return ret;
}

single_op.arg_table_.resize(single_op.input_sizes_.size() + single_op.output_sizes_.size());
ParseArgTable(tbe_task, single_op);
tbe_task->SetModelArgs(model_name_, model_id_);
if (tbe_task->tiling_buffer_ != nullptr) {
tbe_task->stream_resource_ = stream_resource;
}
single_op.tasks_.emplace_back(tbe_task);
} else if (kernel_type == ccKernelType::AI_CPU || kernel_type == ccKernelType::CUST_AI_CPU) {
GELOGD("Building AICPU_CC task");
@@ -261,6 +265,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
return ret;
}
task->SetModelArgs(model_name_, model_id_);
ParseArgTable(task, single_op);
single_op.tasks_.emplace_back(task);
} else {
GELOGE(ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID,
@@ -278,6 +283,7 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
return ret;
}
aicpu_task->SetModelArgs(model_name_, model_id_);
ParseArgTable(aicpu_task, single_op);
single_op.tasks_.emplace_back(aicpu_task);
} else {
// skip
@@ -287,21 +293,23 @@ Status SingleOpModel::BuildTaskList(SingleOp &single_op) {
return SUCCESS;
}

void SingleOpModel::ParseArgTable(TbeOpTask *task, SingleOp &op) {
void SingleOpModel::ParseArgTable(OpTask *task, SingleOp &op) {
if (task == nullptr) {
GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "tbe op task is nullptr");
return;
}

// args: addr1, addr2, addr3 ...
auto *args = const_cast<uintptr_t *>(reinterpret_cast<const uintptr_t *>(task->GetArgs()));
size_t arg_size = task->GetArgSize();
for (size_t i = 0; i < arg_size / sizeof(void *); ++i) {
uintptr_t *ptr_to_addr = args + i;
uintptr_t *arg_base = nullptr;
size_t arg_num = 0;
task->GetIoAddr(arg_base, arg_num);
for (size_t i = 0; i < arg_num; ++i) {
uintptr_t *ptr_to_addr = arg_base + i;
uintptr_t addr = *ptr_to_addr;
auto iter = model_params_.addr_mapping_.find(addr);
if (iter != model_params_.addr_mapping_.end()) {
int arg_index = iter->second;
GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetStubName().c_str(), i, arg_index);
GELOGI("%s args[%zu] mapped to user designated args[%d]", task->GetOpdesc()->GetName().c_str(), i, arg_index);
op.arg_table_[iter->second].emplace_back(ptr_to_addr);
}
}
@@ -386,8 +394,10 @@ Status SingleOpModel::BuildCpuKernelTask(const domi::KernelDef &kernel_def, OpTa
Status SingleOpModel::BuildOp(StreamResource &resource, SingleOp &single_op) {
GE_CHK_STATUS_RET_NOLOG(ParseInputsAndOutputs());
GE_CHK_STATUS_RET_NOLOG(InitModelMem(resource));
single_op.running_param_.reset(new (std::nothrow)SingleOpModelParam(model_params_));
GE_CHECK_NOTNULL(single_op.running_param_);
GE_CHK_STATUS_RET_NOLOG(SetInputsAndOutputs(single_op));
return BuildTaskList(single_op);
return BuildTaskList(&resource, single_op);
}

Status SingleOpModel::BuildModelTaskKernel(const TaskDef &task_def, DynamicSingleOp &single_op) {


+ 2
- 2
ge/single_op/single_op_model.h View File

@@ -65,7 +65,7 @@ class SingleOpModel {
Status ParseInputNode(const OpDescPtr &op_desc);
void ParseOutputNode(const OpDescPtr &op_desc);

Status BuildTaskList(SingleOp &single_op);
Status BuildTaskList(StreamResource *stream_resource, SingleOp &single_op);
Status BuildTaskListForDynamicOp(DynamicSingleOp &dynamic_single_op);
Status BuildKernelTask(const domi::KernelDef &kernel_def, TbeOpTask **task);
Status BuildKernelExTask(const domi::KernelExDef &kernel_def, AiCpuTask **task,
@@ -74,7 +74,7 @@ class SingleOpModel {
Status BuildModelTaskKernel(const domi::TaskDef &task_def, DynamicSingleOp &single_op);

static void ParseOpModelParams(ModelHelper &model_helper, SingleOpModelParam &param);
void ParseArgTable(TbeOpTask *task, SingleOp &op);
void ParseArgTable(OpTask *task, SingleOp &op);

std::string model_name_;
uint32_t model_id_ = 0;


+ 30
- 4
ge/single_op/stream_resource.cc View File

@@ -69,11 +69,25 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose,
size_t size,
size_t &max_allocated,
std::vector<uint8_t *> &allocated) {
if (size == 0) {
GELOGD("Mem size == 0");
return nullptr;
}

if (size <= max_allocated && !allocated.empty()) {
GELOGD("reuse last memory");
return allocated.back();
}

if (!allocated.empty()) {
uint8_t *current_buffer = allocated.back();
allocated.pop_back();
if (rtStreamSynchronize(stream_) != RT_ERROR_NONE) {
GELOGW("Failed to invoke rtStreamSynchronize");
}
(void) rtFree(current_buffer);
}

uint8_t *buffer = nullptr;
auto ret = rtMalloc(reinterpret_cast<void **>(&buffer), size, RT_MEMORY_HBM);
if (ret != RT_ERROR_NONE) {
@@ -96,10 +110,14 @@ uint8_t *StreamResource::DoMallocMemory(const std::string &purpose,
return buffer;
}

uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size) {
uint8_t *StreamResource::MallocMemory(const std::string &purpose, size_t size, bool holding_lock) {
GELOGD("To Malloc memory, size = %zu", size);
uint8_t *buffer = DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
return buffer;
if (holding_lock) {
return DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
} else {
std::lock_guard<std::mutex> lk(stream_mu_);
return DoMallocMemory(purpose, size, max_memory_size_, memory_list_);
}
}

uint8_t *StreamResource::MallocWeight(const std::string &purpose, size_t size) {
@@ -158,7 +176,7 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData &
return ret;
}

auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(&stream_mu_, stream_));
auto new_op = std::unique_ptr<SingleOp>(new(std::nothrow) SingleOp(this, &stream_mu_, stream_));
if (new_op == nullptr) {
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "new SingleOp failed");
return ACL_ERROR_GE_MEMORY_ALLOCATION;
@@ -171,4 +189,12 @@ Status StreamResource::BuildOperator(const string &model_name, const ModelData &
op_map_[model_data.model_data] = std::move(new_op);
return SUCCESS;
}

const uint8_t *StreamResource::GetMemoryBase() const {
if (memory_list_.empty()) {
return nullptr;
}

return memory_list_.back();
}
} // namespace ge

+ 2
- 1
ge/single_op/stream_resource.h View File

@@ -45,8 +45,9 @@ class StreamResource {
Status BuildOperator(const std::string &model_name, const ModelData &model_data, SingleOp **single_op);
Status BuildDynamicOperator(const std::string &model_name, const ModelData &model_data, DynamicSingleOp **single_op);

uint8_t *MallocMemory(const std::string &purpose, size_t size);
uint8_t *MallocMemory(const std::string &purpose, size_t size, bool holding_lock = true);
uint8_t *MallocWeight(const std::string &purpose, size_t size);
const uint8_t *GetMemoryBase() const;

private:
uint8_t *DoMallocMemory(const std::string &purpose,


+ 16
- 5
ge/single_op/task/aicpu_kernel_task_builder.cc View File

@@ -17,17 +17,22 @@
#include "single_op/task/aicpu_kernel_task_builder.h"
#include "framework/common/taskdown_common.h"
#include "graph/load/new_model_manager/model_manager.h"
#include "build_task_utils.h"

namespace ge {
AiCpuCCTaskBuilder::AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def)
: op_desc_(op_desc), kernel_def_(kernel_def) {}

Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) {
Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam &param) {
size_t aicpu_arg_size = kernel_def_.args_size();
if (aicpu_arg_size <= 0) {
if (aicpu_arg_size <= sizeof(aicpu::AicpuParamHead)) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "aicpu_arg_size is invalid, value = %zu", aicpu_arg_size);
return ACL_ERROR_GE_PARAM_INVALID;
}

task.io_addr_num_ = op_desc_->GetInputsSize() + op_desc_->GetOutputsSize();
GE_CHECK_GE(aicpu_arg_size - sizeof(aicpu::AicpuParamHead), task.io_addr_num_ * sizeof(void *));

std::unique_ptr<uint8_t[]> aicpu_args;
aicpu_args.reset(new(std::nothrow) uint8_t[aicpu_arg_size]());
if (aicpu_args == nullptr) {
@@ -41,13 +46,19 @@ Status AiCpuCCTaskBuilder::SetKernelArgs(AiCpuCCTask &task) {
return ACL_ERROR_GE_INTERNAL_ERROR;
}

task.SetIoAddr(aicpu_args.get() + sizeof(aicpu::AicpuParamHead));
task.SetIoAddr(reinterpret_cast<uintptr_t *>(aicpu_args.get() + sizeof(aicpu::AicpuParamHead)));
task.SetKernelArgs(std::move(aicpu_args), aicpu_arg_size);

auto addresses = BuildTaskUtils::GetKernelArgs(op_desc_, param);
GE_CHECK_GE(addresses.size(), task.io_addr_num_);
for (size_t i = 0; i < task.io_addr_num_; ++i) {
task.io_addr_[i] = reinterpret_cast<uintptr_t>(addresses[i]);
}
return SUCCESS;
}

Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id) {
auto ret = SetKernelArgs(task);
Status AiCpuCCTaskBuilder::BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam &param) {
auto ret = SetKernelArgs(task, param);
if (ret != SUCCESS) {
return ret;
}


+ 2
- 2
ge/single_op/task/aicpu_kernel_task_builder.h View File

@@ -30,10 +30,10 @@ class AiCpuCCTaskBuilder {
explicit AiCpuCCTaskBuilder(const OpDescPtr &op_desc, const domi::KernelDef &kernel_def);
~AiCpuCCTaskBuilder() = default;

Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id);
Status BuildTask(AiCpuCCTask &task, uint64_t kernel_id, const SingleOpModelParam &param);

private:
Status SetKernelArgs(AiCpuCCTask &task);
Status SetKernelArgs(AiCpuCCTask &task, const SingleOpModelParam &param);
const OpDescPtr op_desc_;
const domi::KernelDef &kernel_def_;
};


+ 8
- 40
ge/single_op/task/aicpu_task_builder.cc View File

@@ -26,26 +26,6 @@ namespace ge {
AiCpuTaskBuilder::AiCpuTaskBuilder(const OpDescPtr &op_desc, const domi::KernelExDef &kernel_def)
: op_desc_(op_desc), kernel_def_(kernel_def) {}

Status AiCpuTaskBuilder::SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses) {
size_t arg_size = kernel_def_.args_size();
auto rt_ret = rtMalloc(io_addr, arg_size, RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(rt_ret, "rtMalloc failed, size = %zu, ret = %d", arg_size, rt_ret);
return rt_ret;
}

const void *src_addr = reinterpret_cast<const void *>(addresses.data());
uint64_t src_len = sizeof(void *) * addresses.size();
rt_ret = rtMemcpy(*io_addr, arg_size, src_addr, src_len, RT_MEMCPY_HOST_TO_DEVICE);
if (rt_ret != RT_ERROR_NONE) {
(void)rtFree(*io_addr);
GELOGE(rt_ret, "rtMemcpy addresses failed, ret = %d", rt_ret);
return rt_ret;
}

return SUCCESS;
}

Status AiCpuTaskBuilder::SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &fwk_op_kernel) {
auto sec_ret = memcpy_s(&fwk_op_kernel, sizeof(STR_FWK_OP_KERNEL),
kernel_def_.args().data(), kernel_def_.args().size());
@@ -80,39 +60,27 @@ namespace ge {
return SUCCESS;
}

Status AiCpuTaskBuilder::InitWorkspaceAndIO(void **io_addr, void **kernel_workspace,
const SingleOpModelParam &param, bool dynamic_flag) {
Status AiCpuTaskBuilder::InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag) {
if (kernel_def_.args_size() > sizeof(STR_FWK_OP_KERNEL)) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "sizeof STR_FWK_OP_KERNEL is: %lu, but args_size is: %d",
sizeof(STR_FWK_OP_KERNEL), kernel_def_.args_size());
return ACL_ERROR_GE_PARAM_INVALID;
}
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param);
auto ws_addr_vec = addresses.at(BuildTaskUtils::kAddressIndexWorkspace);

if (dynamic_flag) {
GE_CHK_RT_RET(rtMalloc(kernel_workspace, kernel_def_.task_info_size(), RT_MEMORY_HBM));
} else {
if (ws_addr_vec.empty()) {
GELOGE(ACL_ERROR_GE_PARAM_INVALID, "workspace Data Address is empty.");
return ACL_ERROR_GE_PARAM_INVALID;
}
*kernel_workspace = ws_addr_vec[0];
}
GE_CHK_RT_RET(rtMemcpy(*kernel_workspace, kernel_def_.task_info_size(),
GE_CHK_RT_RET(rtMalloc(&task.workspace_addr_, kernel_def_.task_info_size(), RT_MEMORY_HBM));
GE_CHK_RT_RET(rtMemcpy(task.workspace_addr_, kernel_def_.task_info_size(),
kernel_def_.task_info().data(), kernel_def_.task_info_size(),
RT_MEMCPY_HOST_TO_DEVICE));

auto ret = SetInputOutputAddr(io_addr, BuildTaskUtils::JoinAddresses(addresses));
if (ret != SUCCESS) {
return ret;
}
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false);
task.io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses);
task.io_addr_size_ = task.io_addr_host_.size() * sizeof(void *);
GE_CHK_RT_RET(rtMalloc(&task.io_addr_, task.io_addr_size_, RT_MEMORY_HBM));
return SUCCESS;
}

Status AiCpuTaskBuilder::BuildTask(ge::AiCpuTask &task, const SingleOpModelParam &param,
bool dynamic_flag, uint64_t kernel_id) {
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(&task.io_addr_, &task.workspace_addr_, param, dynamic_flag));
GE_CHK_STATUS_RET_NOLOG(InitWorkspaceAndIO(task, param, dynamic_flag));

STR_FWK_OP_KERNEL fwk_op_kernel = {0};
auto ret = SetFmkOpKernel(task.io_addr_, task.workspace_addr_, fwk_op_kernel);


+ 1
- 3
ge/single_op/task/aicpu_task_builder.h View File

@@ -33,10 +33,8 @@ namespace ge {

private:
static Status SetKernelArgs(void **args, STR_FWK_OP_KERNEL &kernel);
Status SetInputOutputAddr(void **io_addr, const std::vector<void *> &addresses);
Status SetFmkOpKernel(void *io_addr, void *ws_addr, STR_FWK_OP_KERNEL &kernel);
Status InitWorkspaceAndIO(void **io_addr, void **kernel_workspace,
const SingleOpModelParam &param, bool dynamic_flag);
Status InitWorkspaceAndIO(AiCpuTask &task, const SingleOpModelParam &param, bool dynamic_flag);

const OpDescPtr op_desc_;
const domi::KernelExDef &kernel_def_;


+ 5
- 2
ge/single_op/task/build_task_utils.cc View File

@@ -32,7 +32,8 @@ const uint64_t kVarSize = 0;
}

std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &op_desc,
const SingleOpModelParam &param) {
const SingleOpModelParam &param,
bool keep_workspace) {
std::vector<std::vector<void *>> ret;
RuntimeParam runtime_para;
runtime_para.mem_size = param.memory_size;
@@ -49,7 +50,9 @@ std::vector<std::vector<void *>> BuildTaskUtils::GetAddresses(const OpDescPtr &o

ret.emplace_back(ModelUtils::GetInputDataAddrs(runtime_para, op_desc));
ret.emplace_back(ModelUtils::GetOutputDataAddrs(runtime_para, op_desc));
ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc));
if (keep_workspace) {
ret.emplace_back(ModelUtils::GetWorkspaceDataAddrs(runtime_para, op_desc));
}
return ret;
}



+ 5
- 3
ge/single_op/task/build_task_utils.h View File

@@ -27,15 +27,17 @@
namespace ge {
class BuildTaskUtils {
public:
static constexpr int kAddressIndexOutput = 1;
static constexpr int kAddressIndexWorkspace = 2;

static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc, const SingleOpModelParam &param);
static std::vector<std::vector<void *>> GetAddresses(const OpDescPtr &op_desc,
const SingleOpModelParam &param,
bool keep_workspace = true);
static std::vector<void *> JoinAddresses(const std::vector<std::vector<void *>> &addresses);
static std::vector<void *> GetKernelArgs(const OpDescPtr &op_desc, const SingleOpModelParam &param);
static std::string GetTaskInfo(const OpDescPtr &op_desc);
template<typename T>
static std::string VectorToString(const std::vector<T> &values)
{
static std::string VectorToString(const std::vector<T> &values) {
std::stringstream ss;
ss << '[';
auto size = values.size();


+ 120
- 34
ge/single_op/task/op_task.cc View File

@@ -24,9 +24,11 @@
#include "common/dump/dump_manager.h"
#include "common/dump/dump_op.h"
#include "common/formats/formats.h"
#include "common/math/math_util.h"
#include "framework/common/debug/log.h"
#include "register/op_tiling.h"
#include "runtime/rt.h"
#include "build_task_utils.h"

namespace ge {
namespace {
@@ -48,18 +50,22 @@ Status OpTask::OpenDump(rtStream_t stream) {
std::vector<uint64_t> output_adds;
auto input_size = op_desc_->GetInputsSize();
auto output_size = op_desc_->GetOutputsSize();
auto all_size = io_addrs_for_dump_.size();
if (input_size + output_size != all_size) {
GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu", all_size,
uintptr_t *arg_base = nullptr;
size_t arg_num = 0;
GetIoAddr(arg_base, arg_num);
if (arg_num < input_size + output_size) {
GELOGE(FAILED, "io_addrs_for_dump_ size %zu is not equal input and output size %zu",
arg_num,
input_size + output_size);
return FAILED;
}

for (size_t i = 0; i < input_size; i++) {
uint64_t input_addr = io_addrs_for_dump_[i];
uint64_t input_addr = arg_base[i];
input_addrs.emplace_back(input_addr);
}
for (size_t j = 0; j < output_size; j++) {
uint64_t output_addr = io_addrs_for_dump_[input_size + j];
uint64_t output_addr = arg_base[input_size + j];
output_adds.emplace_back(output_addr);
}
dump_op_.SetDumpInfo(DumpManager::GetInstance().GetDumpProperties(), op_desc_, input_addrs, output_adds, stream);
@@ -89,10 +95,6 @@ void TbeOpTask::SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size

void TbeOpTask::SetSmDesc(void *sm_desc) { sm_desc_ = sm_desc; }

const vector<int64_t> &OpTask::GetWorkspaceSizes() const { return workspace_sizes_; }

void OpTask::SetWorkspaceSizes(const vector<int64_t> &workspace_sizes) { workspace_sizes_ = workspace_sizes; }

void OpTask::SetModelArgs(std::string model_name, uint32_t model_id) {
model_name_ = model_name;
model_id_ = model_id;
@@ -107,6 +109,36 @@ Status OpTask::GetProfilingArgs(std::string &model_name, std::string &op_name, u
op_name = op_desc_->GetName();
return SUCCESS;
}
Status OpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const vector<GeTensorDesc> &output_desc) {
return UNSUPPORTED;
}
Status OpTask::UpdateArgTable(const SingleOpModelParam &param) {
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param);
auto all_addresses = BuildTaskUtils::JoinAddresses(addresses);
uintptr_t *arg_base = nullptr;
size_t arg_num = 0;
GetIoAddr(arg_base, arg_num);
if (arg_num != all_addresses.size()) {
GELOGE(INTERNAL_ERROR, "[%s] arg number mismatches, expect = %zu, but got = %zu",
op_desc_->GetName().c_str(),
arg_num,
all_addresses.size());
return INTERNAL_ERROR;
}

for (void *addr : all_addresses) {
*arg_base++ = reinterpret_cast<uintptr_t >(addr);
}
return SUCCESS;
}

Status OpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
const vector<DataBuffer> &input_buffers,
vector<GeTensorDesc> &output_desc,
vector<DataBuffer> &output_buffers,
rtStream_t stream) {
return UNSUPPORTED;
}

TbeOpTask::~TbeOpTask() {
if (sm_desc_ != nullptr) {
@@ -141,12 +173,6 @@ Status TbeOpTask::LaunchKernel(rtStream_t stream) {
return RT_FAILED;
}
GELOGI("[TASK_INFO] %s", this->stub_name_.c_str());

size_t input_size = op_desc_->GetInputsSize();
size_t output_size = op_desc_->GetOutputsSize();
uint64_t *io_addr = reinterpret_cast<uint64_t *>(args_.get());
std::vector<uint64_t> io_addrs(io_addr, io_addr + input_size + output_size);
SetIoAddrsForDump(io_addrs);
auto status = OpenDump(stream);
if (status != SUCCESS) {
GELOGE(status, "Open dump failed in the tbe single op %s", this->stub_name_.c_str());
@@ -167,11 +193,12 @@ Status TbeOpTask::UpdateRunInfo(const vector<GeTensorDesc> &input_desc, const ve
GELOGE(FAILED, "Failed to invoke OpParaCalculate. ret = %u", ret);
return FAILED;
}
SetWorkspaceSizes(run_info.workspaces);
block_dim_ = run_info.block_dim;
tiling_data_ = run_info.tiling_data.str();
GELOGD("Done invoking OpParaCalculate successfully. block_dim = %u, tiling size = %zu", block_dim_,
tiling_data_.size());

GE_CHK_STATUS_RET(AllocateWorkspaces(run_info.workspaces), "Failed to allocate workspaces");
return SUCCESS;
}

@@ -227,13 +254,54 @@ void TbeOpTask::EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, s
max_tiling_size_ = max_tiling_size;
}

Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void *> &outputs,
const vector<void *> &workspaces, rtStream_t stream) {
Status TbeOpTask::AllocateWorkspaces(const vector<int64_t> &workspace_sizes) {
static const std::string kPurpose("malloc workspace memory for dynamic op.");
if (workspace_sizes.empty()) {
GELOGD("No need to allocate workspace.");
return SUCCESS;
}
int64_t total_size = 0;
std::vector<int64_t> ws_offsets;
for (auto ws_size : workspace_sizes) {
// alignment and padding should be done in OpParaCalculate
GE_CHK_STATUS_RET_NOLOG(CheckInt64AddOverflow(total_size, ws_size));
ws_offsets.emplace_back(total_size);
total_size += ws_size;
}

GELOGD("Total workspace size is %ld", total_size);
GE_CHECK_NOTNULL(stream_resource_);
auto ws_base = stream_resource_->MallocMemory(kPurpose, static_cast<size_t>(total_size));
if (ws_base == nullptr) {
GELOGE(ACL_ERROR_GE_MEMORY_ALLOCATION, "Failed to allocate memory of size: %ld", total_size);
return ACL_ERROR_GE_MEMORY_ALLOCATION;
}
GELOGD("Done allocating workspace memory successfully.");

for (auto ws_offset : ws_offsets) {
workspaces_.emplace_back(ws_base + ws_offset);
}

return SUCCESS;
}

Status TbeOpTask::LaunchKernel(const vector<GeTensorDesc> &input_desc,
const vector<DataBuffer> &input_buffers,
vector<GeTensorDesc> &output_desc,
vector<DataBuffer> &output_buffers,
rtStream_t stream) {
GE_CHK_STATUS_RET_NOLOG(UpdateRunInfo(input_desc, output_desc));
GELOGD("[%s] Start to launch kernel", node_->GetName().c_str());
std::vector<void *> args;
args.insert(args.end(), inputs.begin(), inputs.end());
args.insert(args.end(), outputs.begin(), outputs.end());
args.insert(args.end(), workspaces.begin(), workspaces.end());
for (auto &buffer : input_buffers) {
args.emplace_back(buffer.data);
}
for (auto &buffer : output_buffers) {
args.emplace_back(buffer.data);
}
for (auto &buffer : workspaces_) {
args.emplace_back(buffer);
}

if (tiling_buffer_ != nullptr) {
GELOGD("[%s] Start to copy tiling info. size = %zu", node_->GetName().c_str(), tiling_data_.size());
@@ -254,6 +322,14 @@ Status TbeOpTask::LaunchKernel(const vector<void *> &inputs, const vector<void *
return SUCCESS;
}

void TbeOpTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
arg_base = reinterpret_cast<uintptr_t *>(args_.get());
arg_count = arg_size_ / sizeof(void *);
if (tiling_buffer_ != nullptr) {
--arg_count;
}
}

AiCpuBaseTask::~AiCpuBaseTask() {
if (ext_info_addr_dev_ != nullptr) {
(void)rtFree(ext_info_addr_dev_);
@@ -399,12 +475,14 @@ AiCpuTask::~AiCpuTask() {
}
}

const void *AiCpuTask::GetIOAddr() const { return io_addr_; }

Status AiCpuTask::LaunchKernel(rtStream_t stream) {
GELOGD("Start to launch kernel. task = %s", this->op_type_.c_str());
auto ret = rtMemcpyAsync(workspace_addr_, task_info_.size(), task_info_.data(), task_info_.size(),
RT_MEMCPY_HOST_TO_DEVICE_EX, stream);
auto ret = rtMemcpyAsync(io_addr_,
io_addr_size_,
io_addr_host_.data(),
io_addr_host_.size() * sizeof(void *),
RT_MEMCPY_HOST_TO_DEVICE_EX,
stream);
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "rtMemcpyAsync workspace data failed. ret = %d, task = %s", ret, this->op_type_.c_str());
return RT_FAILED;
@@ -680,6 +758,17 @@ Status AiCpuTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
return SUCCESS;
}

Status AiCpuTask::UpdateArgTable(const SingleOpModelParam &param) {
auto addresses = BuildTaskUtils::GetAddresses(op_desc_, param, false);
io_addr_host_ = BuildTaskUtils::JoinAddresses(addresses);
return SUCCESS;
}

void AiCpuTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
arg_base = reinterpret_cast<uintptr_t *>(io_addr_host_.data());
arg_count = io_addr_host_.size();
}

void AiCpuCCTask::SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size) {
args_ = std::move(args);
arg_size_ = arg_size;
@@ -691,9 +780,7 @@ void AiCpuCCTask::SetSoName(const std::string &so_name) { so_name_ = so_name; }

void AiCpuCCTask::SetkernelName(const std::string &kernel_Name) { kernel_name_ = kernel_Name; }

void AiCpuCCTask::SetIoAddr(void *io_addr) { io_addr_ = io_addr; }

const void *AiCpuCCTask::GetIOAddr() const { return io_addr_; }
void AiCpuCCTask::SetIoAddr(uintptr_t *io_addr) { io_addr_ = io_addr; }

const void *AiCpuCCTask::GetArgs() const { return args_.get(); }

@@ -716,12 +803,6 @@ Status AiCpuCCTask::LaunchKernel(rtStream_t stream) {
return ret;
}
GELOGD("Invoke rtCpuKernelLaunch succeeded");

size_t input_size = op_desc_->GetInputsSize();
size_t output_size = op_desc_->GetOutputsSize();
uint64_t *io_addr = reinterpret_cast<uint64_t *>(io_addr_);
std::vector<uint64_t> io_addrs (io_addr, io_addr + input_size + output_size);
SetIoAddrsForDump(io_addrs);
auto status = OpenDump(stream);
if (status != SUCCESS) {
GELOGE(status, "Open dump failed in the aicpucc single op %s", this->kernel_name_.c_str());
@@ -761,4 +842,9 @@ Status AiCpuCCTask::LaunchKernel(const std::vector<GeTensorDesc> &input_desc,

return SUCCESS;
}

void AiCpuCCTask::GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) {
arg_base = io_addr_;
arg_count = io_addr_num_;
}
} // namespace ge

+ 35
- 57
ge/single_op/task/op_task.h View File

@@ -32,49 +32,27 @@
#include "init/gelib.h"

namespace ge {
enum OpTaskType {
OP_TASK_TBE = 0,
OP_TASK_AICPU,
OP_TASK_AICPUCC,
OP_TASK_INVALID,
};

class StreamResource;
struct SingleOpModelParam;
class OpTask {
public:
OpTask() = default;
virtual ~OpTask() = default;
virtual Status LaunchKernel(rtStream_t stream) = 0;
virtual Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc,
const vector<GeTensorDesc> &output_desc) {
return UNSUPPORTED;
}
virtual Status LaunchKernel(const std::vector<void *> &inputs,
const std::vector<void *> &outputs,
const std::vector<void *> &workspaces,
rtStream_t stream) {
return UNSUPPORTED;
}
virtual OpTaskType GetOpTaskType() = 0;
virtual const void *GetIOAddr() const = 0;
const vector<int64_t> &GetWorkspaceSizes() const;
void SetWorkspaceSizes(const vector<int64_t> &workspace_sizes);
const vector<GeTensorDesc> &output_desc);
virtual Status UpdateArgTable(const SingleOpModelParam &param);
void SetModelArgs(std::string model_name, uint32_t model_id);
Status GetProfilingArgs(std::string &model_name, std::string &op_name, uint32_t &model_id, uint32_t &block_dim);
const OpDescPtr &GetOpdesc() const {return op_desc_;}
Status OpenDump(rtStream_t stream);
void SetIoAddrsForDump(const vector<uint64_t> &io_addrs_for_dump) {
io_addrs_for_dump_ = io_addrs_for_dump;
}
virtual void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) = 0;
virtual Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &input_buffers,
std::vector<GeTensorDesc> &output_desc,
std::vector<DataBuffer> &output_buffers,
rtStream_t stream) {
return UNSUPPORTED;
}
rtStream_t stream);

private:
std::vector<int64_t> workspace_sizes_;
protected:
DumpProperties dump_properties_;
DumpOp dump_op_;
@@ -82,19 +60,18 @@ class OpTask {
std::string model_name_;
uint32_t model_id_ = 0;
uint32_t block_dim_ = 1;
std::vector<uint64_t> io_addrs_for_dump_;
};

class TbeOpTask : public OpTask {
public:
~TbeOpTask() override;
Status LaunchKernel(rtStream_t stream) override;
OpTaskType GetOpTaskType() override {
return OP_TASK_TBE;
}
const void *GetIOAddr() const override {
return nullptr;
}
Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &input_buffers,
std::vector<GeTensorDesc> &output_desc,
std::vector<DataBuffer> &output_buffers,
rtStream_t stream) override;
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
void SetSmDesc(void *sm_desc);
void SetStubFunc(const std::string &name, const void *stub_func);
void SetKernelArgs(std::unique_ptr<uint8_t[]> &&args, size_t arg_size, uint32_t block_dim, const OpDescPtr &op_desc);
@@ -102,20 +79,17 @@ class TbeOpTask : public OpTask {
Status UpdateRunInfo(const vector<GeTensorDesc> &input_desc,
const vector<GeTensorDesc> &output_desc) override;

Status LaunchKernel(const vector<void *> &inputs,
const vector<void *> &outputs,
const vector<void *> &workspaces,
rtStream_t stream) override;

const void *GetArgs() const;
size_t GetArgSize() const;
const std::string &GetStubName() const;
void EnableDynamicSupport(const NodePtr &node, void *tiling_buffer, size_t max_tiling_size);

private:
friend class SingleOpModel;
static Status UpdateTensorDesc(const GeTensorDesc &src_tensor, GeTensorDesc &dst_tensor);
Status UpdateNodeByShape(const vector<GeTensorDesc> &input_desc,
const vector<GeTensorDesc> &output_desc);
Status AllocateWorkspaces(const std::vector<int64_t> &workspace_sizes);

const void *stub_func_ = nullptr;
std::unique_ptr<uint8_t[]> args_;
@@ -123,9 +97,11 @@ class TbeOpTask : public OpTask {
void *sm_desc_ = nullptr;
std::string stub_name_;

StreamResource *stream_resource_ = nullptr;
void *tiling_buffer_ = nullptr;
uint32_t max_tiling_size_ = 0;
std::string tiling_data_;
std::vector<void *> workspaces_;
NodePtr node_;
};

@@ -133,7 +109,7 @@ class AiCpuBaseTask : public OpTask {
public:
AiCpuBaseTask() = default;
~AiCpuBaseTask() override;
const UnknowShapeOpType GetUnknownType() const { return unknown_type_; }
UnknowShapeOpType GetUnknownType() const { return unknown_type_; }

protected:
Status SetExtInfoAndType(const std::string &kernel_ext_info, uint64_t kernel_id);
@@ -158,10 +134,8 @@ class AiCpuTask : public AiCpuBaseTask {
~AiCpuTask() override;

Status LaunchKernel(rtStream_t stream) override;
OpTaskType GetOpTaskType() override {
return OP_TASK_AICPU;
}
const void *GetIOAddr() const override;
Status UpdateArgTable(const SingleOpModelParam &param) override;
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;

Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
const std::vector<DataBuffer> &input_buffers,
@@ -188,27 +162,31 @@ class AiCpuTask : public AiCpuBaseTask {
friend class AiCpuTaskBuilder;
void *workspace_addr_ = nullptr;
std::string task_info_;
// device addr
// device addr
void *args_ = nullptr;
size_t arg_size_ = 0;
std::string op_type_;
// device addr
void *io_addr_ = nullptr;
size_t io_addr_size_ = 0;

// host addr
std::vector<void *> io_addr_host_;

bool dynamic_flag_ = false;
// for copy task
void *copy_task_args_buf_;
void *copy_workspace_buf_;
void *copy_task_args_buf_ = nullptr;
void *copy_workspace_buf_ = nullptr;

std::vector<void *> output_summary_;
std::vector<aicpu::FWKAdapter::ResultSummary> output_summary_host_;

void *copy_ioaddr_dev_;
void *copy_ioaddr_dev_ = nullptr;

void *copy_input_release_flag_dev_;
void *copy_input_data_size_dev_;
void *copy_input_src_dev_;
void *copy_input_dst_dev_;
void *copy_input_release_flag_dev_ = nullptr;
void *copy_input_data_size_dev_ = nullptr;
void *copy_input_src_dev_ = nullptr;
void *copy_input_dst_dev_ = nullptr;

vector<void *> out_shape_hbm_;
uint64_t kernel_id_ = 0;
@@ -222,13 +200,12 @@ class AiCpuCCTask : public AiCpuBaseTask {
AiCpuCCTask &operator=(const AiCpuCCTask &) = delete;

Status LaunchKernel(rtStream_t stream) override;
OpTaskType GetOpTaskType() override { return OP_TASK_AICPUCC; }
const void *GetIOAddr() const override;
void GetIoAddr(uintptr_t *&arg_base, size_t &arg_count) override;
const void *GetArgs() const;
void SetKernelArgs(std::unique_ptr<uint8_t[]> args, size_t arg_size);
void SetSoName(const std::string &so_name);
void SetkernelName(const std::string &kernel_Name);
void SetIoAddr(void *io_addr);
void SetIoAddr(uintptr_t *io_addr);
size_t GetArgSize() const;

Status LaunchKernel(const std::vector<GeTensorDesc> &input_desc,
@@ -244,7 +221,8 @@ private:
std::unique_ptr<uint8_t[]> args_;
size_t arg_size_ = 0;
void *sm_desc_ = nullptr;
void *io_addr_ = nullptr;
uintptr_t *io_addr_ = nullptr;
size_t io_addr_num_ = 0;
bool is_custom_ = false;
uint32_t dump_flag_ = RT_KERNEL_DEFAULT;
};


Loading…
Cancel
Save