node_executor.cc

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "hybrid/node_executor/node_executor.h"
#include "framework/common/debug/log.h"
#include "common/math/math_util.h"
#include "graph/utils/node_utils.h"
#include "init/gelib.h"
#include "graph/utils/tensor_utils.h"
#include "hybrid/executor/hybrid_execution_context.h"
#include "hybrid/model/hybrid_model.h"
#include "graph/debug/ge_attr_define.h"
#include "opskernel_manager/ops_kernel_builder_manager.h"

namespace ge {
namespace hybrid {
namespace {
// Kernel-lib names used to map an op's assigned engine to an ExecutorType.
const char *const kEngineNameAiCore = "AIcoreEngine";
const char *const kEngineNameGeLocal = "DNN_VM_GE_LOCAL_OP_STORE";
const char *const kEngineNameAiCpu = "aicpu_ascend_kernel";
const char *const kEngineNameAiCpuTf = "aicpu_tf_kernel";
const char *const kEngineNameHccl = "ops_kernel_info_hccl";
const char *const kEngineNameRts = "DNN_VM_RTS_OP_STORE";
const char *const kEngineNameHostCpu = "DNN_VM_HOST_CPU_OP_STORE";
}  // namespace

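// NodeExecutor is the per-engine execution strategy: PrepareTask allocates the
// task's outputs and workspaces and refreshes its arguments, ExecuteTask then
// launches the task asynchronously, and LoadTask/CompileTask are optional
// hooks that default to UNSUPPORTED below.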
Status NodeExecutor::PrepareTask(NodeTask &task, TaskContext &context) const {
  GE_CHK_STATUS_RET_NOLOG(context.AllocateOutputs());
  GE_CHK_STATUS_RET_NOLOG(context.AllocateWorkspaces());
  GE_CHK_STATUS_RET_NOLOG(task.UpdateArgs(context));
  return SUCCESS;
}

Status NodeExecutor::ExecuteTask(NodeTask &task, TaskContext &context, const std::function<void()> &callback) const {
  HYBRID_CHK_STATUS_RET(task.ExecuteAsync(context, callback),
                        "[Execute][Task] failed. node = %s", context.GetNodeItem().NodeName().c_str());
  return SUCCESS;
}

Status NodeExecutor::LoadTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const {
  return UNSUPPORTED;
}

Status NodeExecutor::CompileTask(const HybridModel &model, const NodePtr &node, shared_ptr<NodeTask> &task) const {
  return UNSUPPORTED;
}

Status NodeExecutorManager::EnsureInitialized() {
  std::lock_guard<std::mutex> lk(mu_);
  ++ref_count_;
  if (initialized_) {
    return SUCCESS;
  }

  engine_mapping_.emplace(kEngineNameAiCore, NodeExecutorManager::ExecutorType::AICORE);
  engine_mapping_.emplace(kEngineNameGeLocal, NodeExecutorManager::ExecutorType::GE_LOCAL);
  engine_mapping_.emplace(kEngineNameAiCpuTf, NodeExecutorManager::ExecutorType::AICPU_TF);
  engine_mapping_.emplace(kEngineNameAiCpu, NodeExecutorManager::ExecutorType::AICPU_TF);
  engine_mapping_.emplace(kEngineNameHccl, NodeExecutorManager::ExecutorType::HCCL);
  engine_mapping_.emplace(kEngineNameRts, NodeExecutorManager::ExecutorType::RTS);
  engine_mapping_.emplace(kEngineNameHostCpu, NodeExecutorManager::ExecutorType::HOST_CPU);
  initialized_ = true;
  GELOGI("Initializing NodeExecutors successfully");
  return SUCCESS;
}

NodeExecutorManager::ExecutorType NodeExecutorManager::ResolveExecutorType(Node &node) const {
  auto op_type = node.GetType();
  if (op_type == PARTITIONEDCALL) {
    const auto &subgraph = NodeUtils::GetSubgraph(node, 0);
    if (subgraph != nullptr && subgraph->GetGraphUnknownFlag()) {
      return ExecutorType::DYNAMIC_SUBGRAPH;
    }
    bool is_dynamic = false;
    (void)NodeUtils::GetNodeUnknownShapeStatus(node, is_dynamic);
    if (is_dynamic) {
      return ExecutorType::DYNAMIC_SUBGRAPH;
    }
    return ExecutorType::COMPILED_SUBGRAPH;
  }

  // rts kernel store is assigned to NetOutput
  if (op_type == NETOUTPUT || op_type == VARIABLE) {
    return ExecutorType::GE_LOCAL;
  }

  if (IsControlFlowV2Op(op_type)) {
    return ExecutorType::CONTROL_OP;
  }

  auto op_desc = node.GetOpDesc();  // checked before
  const auto &lib_name = op_desc->GetOpKernelLibName();
  auto it = engine_mapping_.find(lib_name);
  if (it == engine_mapping_.end()) {
    REPORT_INNER_ERROR("E19999", "Failed to get ExecutorType by lib_name:%s, node:%s",
                       lib_name.c_str(), node.GetName().c_str());
    GELOGE(UNSUPPORTED, "[Find][ExecutorType]Failed to get ExecutorType by lib_name:%s, node:%s",
           lib_name.c_str(), node.GetName().c_str());
    return ExecutorType::RESERVED;
  }

  return it->second;
}

Status NodeExecutorManager::GetExecutor(Node &node, const NodeExecutor **executor) {
  auto executor_type = ResolveExecutorType(node);
  GELOGD("[%s] Set node executor by type: %d.", node.GetName().c_str(), static_cast<int>(executor_type));
  const auto it = executors_.find(executor_type);
  if (it == executors_.end()) {
    return GetOrCreateExecutor(executor_type, executor);
  }

  *executor = it->second.get();
  return SUCCESS;
}

void NodeExecutorManager::RegisterExecutorBuilder(NodeExecutorManager::ExecutorType executor_type,
                                                  const std::function<NodeExecutor *()> &builder) {
  builders_.emplace(executor_type, builder);
}

Status NodeExecutorManager::CalcOpRunningParam(Node &node) const {
  auto op_desc = node.GetOpDesc();
  GE_CHECK_NOTNULL(op_desc);
  if (op_desc->GetType() == PARTITIONEDCALL) {
    GELOGD("[%s] Skipping CalcOpRunningParam for PartitionedCall.", node.GetName().c_str());
    return SUCCESS;
  }

  // Reset all output sizes before recalculating them.
  for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) {
    GeTensorDescPtr output_tensor = op_desc->MutableOutputDesc(static_cast<uint32_t>(i));
    GE_CHECK_NOTNULL(output_tensor);
    TensorUtils::SetSize(*(output_tensor.get()), 0);
  }

  // Calculate HCCL output sizes independently: the HCCL ops kernel manager would
  // call GetSize on each input (i.e. the output size of the upstream op), but
  // that sometimes fails when running multi-threaded.
  if (op_desc->GetOpKernelLibName() == kEngineNameHccl) {
    for (size_t i = 0; i < op_desc->GetOutputsSize(); ++i) {
      GeTensorDesc output_tensor = op_desc->GetOutputDesc(static_cast<uint32_t>(i));
      Format format = output_tensor.GetFormat();
      DataType data_type = output_tensor.GetDataType();
      GeShape output_shape = output_tensor.GetShape();
      int64_t output_mem_size = 0;
      GE_CHK_STATUS_RET(TensorUtils::CalcTensorMemSize(output_shape, format, data_type, output_mem_size),
                        "[Calc][TensorMemSize] failed, node:%s.", node.GetName().c_str());
      GE_CHK_STATUS_RET(CheckInt64AddOverflow(output_mem_size, MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1),
                        "[Check][Overflow][%s] Invalid output mem size: %ld",
                        node.GetName().c_str(),
                        output_mem_size);
      // Pad by MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1 bytes, then truncate
      // to a multiple of MEMORY_ALIGN_SIZE, so the stored size is always aligned.
      output_mem_size = ((output_mem_size +
                          MEMORY_ALIGN_RATIO * MEMORY_ALIGN_SIZE - 1) / MEMORY_ALIGN_SIZE) * MEMORY_ALIGN_SIZE;
      TensorUtils::SetSize(output_tensor, output_mem_size);
      GE_CHK_STATUS_RET(op_desc->UpdateOutputDesc(static_cast<uint32_t>(i), output_tensor),
                        "[Update][OutputDesc] failed, node:%s.", node.GetName().c_str());
      GELOGD("%s output desc[%zu], dim_size: %zu, mem_size: %ld.", node.GetName().c_str(), i,
             output_tensor.GetShape().GetDimNum(), output_mem_size);
    }
    return SUCCESS;
  }

  return OpsKernelBuilderManager::Instance().CalcOpRunningParam(node);
}

bool NodeExecutorManager::IsExecutorInitialized(NodeExecutorManager::ExecutorType executor_type) {
  std::lock_guard<std::mutex> lk(mu_);
  return executors_.find(executor_type) != executors_.end();
}

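// Executors are created lazily: the first lookup for a given ExecutorType runs
// the registered builder under mu_, initializes the new executor, and caches it
// in executors_; subsequent lookups return the cached instance.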
Status NodeExecutorManager::GetOrCreateExecutor(ExecutorType executor_type, const NodeExecutor **out_executor) {
  std::lock_guard<std::mutex> lk(mu_);
  const auto executor_it = executors_.find(executor_type);
  if (executor_it != executors_.end()) {
    *out_executor = executor_it->second.get();
    return SUCCESS;
  }

  GELOGI("Start to Initialize NodeExecutor, type = %d", static_cast<int>(executor_type));
  auto it = builders_.find(executor_type);
  if (it == builders_.end()) {
    REPORT_CALL_ERROR("E19999", "Create NodeExecutor failed for executor type = %d",
                      static_cast<int>(executor_type));
    GELOGE(INTERNAL_ERROR, "[Create][NodeExecutor] failed for executor type = %d", static_cast<int>(executor_type));
    return INTERNAL_ERROR;
  }

  auto build_fn = it->second;
  GE_CHECK_NOTNULL(build_fn);
  auto executor = std::unique_ptr<NodeExecutor>(build_fn());
  if (executor == nullptr) {
    REPORT_CALL_ERROR("E19999", "Create NodeExecutor failed for executor type = %d",
                      static_cast<int>(executor_type));
    GELOGE(INTERNAL_ERROR, "[Create][NodeExecutor] failed for engine type = %d", static_cast<int>(executor_type));
    return INTERNAL_ERROR;
  }

  GELOGD("Executor of engine type = %d was created successfully", static_cast<int>(executor_type));
  auto ret = executor->Initialize();
  if (ret != SUCCESS) {
    REPORT_CALL_ERROR("E19999", "Initialize NodeExecutor failed for type = %d", static_cast<int>(executor_type));
    GELOGE(ret, "[Initialize][NodeExecutor] failed for type = %d", static_cast<int>(executor_type));
    return ret;
  }

  *out_executor = executor.get();
  executors_.emplace(executor_type, std::move(executor));
  GELOGI("Initializing NodeExecutor successfully, type = %d", static_cast<int>(executor_type));
  return SUCCESS;
}

void NodeExecutorManager::FinalizeExecutors() {
  std::lock_guard<std::mutex> lk(mu_);
  if (ref_count_ <= 0) {
    GELOGD("No need for finalizing for not initialized.");
    return;
  }

  if (--ref_count_ > 0) {
    GELOGD("Ref count = %d, do not finalize executors.", ref_count_);
    return;
  }

  GELOGD("Start to invoke Finalize on executors.");
  for (auto &it : executors_) {
    it.second->Finalize();
  }
  executors_.clear();
  GELOGD("Done invoking Finalize successfully.");
}

NodeExecutorRegistrar::NodeExecutorRegistrar(NodeExecutorManager::ExecutorType executor_type,
                                             NodeExecutor *(*builder)()) {
  NodeExecutorManager::GetInstance().RegisterExecutorBuilder(executor_type, builder);
}

Status NoOpTask::UpdateArgs(TaskContext &context) {
  GELOGD("[%s] Skipping UpdateArgs for op with empty outputs", context.GetNodeName());
  return SUCCESS;
}

Status NoOpTask::ExecuteAsync(TaskContext &context, std::function<void()> done_callback) {
  GELOGD("[%s] Skipping execution for op with empty outputs", context.GetNodeName());
  return context.TryExecuteCallback(done_callback);
}
}  // namespace hybrid
}  // namespace ge
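
Backends plug into this manager through NodeExecutorRegistrar at static-initialization time rather than constructing executors directly. The sketch below illustrates that flow; it is a minimal illustration, not code from this file: MyNodeExecutor is a hypothetical backend, ExecutorType::RESERVED is used only as a placeholder slot, and it assumes the REGISTER_NODE_EXECUTOR_BUILDER convenience macro declared in hybrid/node_executor/node_executor.h, which expands to a file-scope registrar.

// Illustrative sketch only. "MyNodeExecutor" is hypothetical; real backends
// such as AiCoreNodeExecutor register themselves the same way.
#include "hybrid/node_executor/node_executor.h"

namespace ge {
namespace hybrid {
class MyNodeExecutor : public NodeExecutor {
 public:
  Status LoadTask(const HybridModel &model, const NodePtr &node,
                  std::shared_ptr<NodeTask> &task) const override {
    // Build a NodeTask for this node here; the base class defaults to UNSUPPORTED.
    return SUCCESS;
  }
};

// Registers a builder for one ExecutorType slot; GetOrCreateExecutor invokes it
// lazily for the first node dispatched to that slot. RESERVED serves here only
// as a placeholder slot for the sketch.
REGISTER_NODE_EXECUTOR_BUILDER(NodeExecutorManager::ExecutorType::RESERVED, MyNodeExecutor);
}  // namespace hybrid
}  // namespace ge

At run time the hybrid executor obtains the instance via NodeExecutorManager::GetInstance().GetExecutor(node, &executor): ResolveExecutorType picks the slot from the node's type or kernel-lib name, and PrepareTask/ExecuteTask then drive the task returned by the executor.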

The Graph Engine (GE) module is a submodule of MindSpore, implemented in C++. It sits between the front-end module (ME) and the underlying hardware, acting as a bridge: GE takes the graph issued by ME as input, applies a series of deep graph optimizations, and outputs a graph that runs efficiently on the underlying hardware. GE performs optimizations tailored to the hardware architecture of the Ascend AI processor to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE consists of two main parts, GE API and GE Core; the detailed architecture diagram is shown below.