You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

hccl_task.cc 10 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "ge_runtime/task/hccl_task.h"
  17. #include <algorithm>
  18. #include "ge_runtime/task/task_factory.h"
  19. #include "common/opskernel/ops_kernel_info_store.h"
  20. #include "common/opskernel/ge_task_info.h"
  21. namespace ge {
  22. namespace model_runner {
// Process-wide cache of HCCL secondary streams, shared by all HcclTask
// instances: model handle -> (master stream id -> weak refs to the secondary
// streams created for it). Entries are weak_ptr so cached streams die with
// their last owning task; all access is guarded by model_stream_mapping_mutex_.
std::map<rtModel_t, std::map<uint32_t, std::vector<std::weak_ptr<HcclTask::StreamGuard>>>>
    HcclTask::model_stream_mapping_;
std::mutex HcclTask::model_stream_mapping_mutex_;
  26. HcclTask::HcclTask(const ModelContext &model_context, const std::shared_ptr<HcclTaskInfo> &task_info)
  27. : TaskRepeater<HcclTaskInfo>(model_context, task_info),
  28. task_info_(task_info),
  29. stream_(nullptr),
  30. workspace_mem_(nullptr),
  31. rt_model_handle_(nullptr),
  32. priority_(0),
  33. secondary_stream_list_() {
  34. if (task_info_ == nullptr) {
  35. GELOGW("task_info_ is null!");
  36. }
  37. priority_ = model_context.priority();
  38. rt_model_handle_ = model_context.rt_model_handle();
  39. auto stream_list = model_context.stream_list();
  40. if (stream_list.size() == 1) {
  41. stream_ = stream_list[0];
  42. } else if (stream_list.size() > task_info->stream_id()) {
  43. stream_ = stream_list[task_info->stream_id()];
  44. } else {
  45. GELOGW("Index: %u >= stream_list.size(): %zu.", task_info->stream_id(), stream_list.size());
  46. }
  47. }
  48. HcclTask::~HcclTask() {
  49. if (workspace_mem_ != nullptr) {
  50. rtError_t rt_ret = rtFree(workspace_mem_);
  51. if (rt_ret != RT_ERROR_NONE) {
  52. GELOGE(RT_FAILED, "rtFree workspace_mem_ failed! ret: 0x%X.", rt_ret);
  53. }
  54. workspace_mem_ = nullptr;
  55. }
  56. }
  57. bool HcclTask::Distribute() {
  58. // Ops kernel info store
  59. // Get privateDef and opsKernelStorePtr
  60. GELOGI("Get custom info in modelTaskDef");
  61. void *ops_kernel_store = task_info_->ops_kernel_store();
  62. OpsKernelInfoStore *ops_kernel_info_store = reinterpret_cast<OpsKernelInfoStore *>(ops_kernel_store);
  63. if (ops_kernel_store == nullptr) {
  64. GELOGE(PARAM_INVALID, "No hcom distribute function ptr and no ops kernel store.");
  65. return false;
  66. }
  67. char *private_def = reinterpret_cast<char *>(const_cast<char unsigned *>(task_info_->private_def().data()));
  68. auto private_def_len = static_cast<uint32_t>(task_info_->private_def().size());
  69. GELOGI("The first address of the custom info, privateDef=%p", private_def);
  70. SetSecondaryStream();
  71. if (task_info_->workspace_size() > 0) {
  72. rtError_t rt_ret = rtMalloc(&workspace_mem_, task_info_->workspace_size(), RT_MEMORYINFO_HBM);
  73. if (rt_ret != RT_ERROR_NONE) {
  74. GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
  75. return false;
  76. }
  77. }
  78. GELOGI("HcclTaskInfo Distribute Start. begin to call function LoadTask in hccl.");
  79. GETaskInfo ge_task;
  80. ge_task.id = 0;
  81. ge_task.type = static_cast<uint16_t>(RT_MODEL_TASK_HCCL);
  82. ge_task.stream = stream_;
  83. ge_task.kernelHcclInfo = std::vector<GETaskKernelHcclInfo>(1);
  84. ge_task.kernelHcclInfo[0].hccl_type = task_info_->hccl_type();
  85. ge_task.kernelHcclInfo[0].inputDataAddr = task_info_->input_data_addr();
  86. ge_task.kernelHcclInfo[0].outputDataAddr = task_info_->output_data_addr();
  87. ge_task.kernelHcclInfo[0].workSpaceAddr = workspace_mem_;
  88. ge_task.kernelHcclInfo[0].workSpaceMemSize = task_info_->workspace_size();
  89. ge_task.kernelHcclInfo[0].count = task_info_->count();
  90. ge_task.kernelHcclInfo[0].dataType = static_cast<int32_t>(task_info_->data_type());
  91. ge_task.kernelHcclInfo[0].opType = static_cast<int32_t>(task_info_->op_type());
  92. ge_task.kernelHcclInfo[0].rootId = task_info_->root_id();
  93. std::vector<rtStream_t> secondary_stream_list;
  94. std::transform(secondary_stream_list_.begin(), secondary_stream_list_.end(),
  95. std::back_inserter(secondary_stream_list),
  96. [](const std::shared_ptr<StreamGuard> &stream) -> rtStream_t { return stream->GetStream(); });
  97. ge_task.kernelHcclInfo[0].hcclStreamList = secondary_stream_list;
  98. ge_task.privateDef = private_def;
  99. ge_task.privateDefLen = private_def_len;
  100. ge_task.opsKernelStorePtr = ops_kernel_store;
  101. auto result = ops_kernel_info_store->LoadTask(ge_task);
  102. // tagHcclResult::HCCL_SUCCESS is 0
  103. if (result != 0) {
  104. GELOGE(INTERNAL_ERROR, "davinci_model : load task fail, return ret: %u", result);
  105. return false;
  106. }
  107. GELOGI("Call function LoadTask end.");
  108. return true;
  109. }
  110. bool HcclTask::SetSecondaryStream() {
  111. const uint32_t master_stream_id = task_info_->stream_id();
  112. const int64_t hccl_secondary_stream_num = task_info_->hccl_stream_num();
  113. Status ret;
  114. std::lock_guard<std::mutex> lock(model_stream_mapping_mutex_);
  115. if (model_stream_mapping_.find(rt_model_handle_) == model_stream_mapping_.end()) {
  116. GELOGI("Need to create map for rt_model_handle_:%p with new mainstream %ld.", rt_model_handle_, master_stream_id);
  117. ret = CreateStream(hccl_secondary_stream_num, master_stream_id);
  118. if (!ret) {
  119. GELOGE(RT_FAILED, "Create hccl stream failed.");
  120. return false;
  121. }
  122. return true;
  123. }
  124. std::map<uint32_t, std::vector<std::weak_ptr<StreamGuard>>> &master_secondary_stream_map =
  125. model_stream_mapping_.at(rt_model_handle_);
  126. if (auto iter = master_secondary_stream_map.find(master_stream_id); iter != master_secondary_stream_map.end()) {
  127. std::vector<std::weak_ptr<StreamGuard>> &secondary_stream_vec = iter->second;
  128. auto lock_weak_ptr = [&secondary_stream_vec, this](int64_t index) -> bool {
  129. auto stream = secondary_stream_vec[index].lock();
  130. if (stream == nullptr) {
  131. rtStream_t new_stream = nullptr;
  132. bool ret = CreateStream(rt_model_handle_, &new_stream);
  133. if (!ret) {
  134. GELOGE(FAILED, "CreateStream failed.");
  135. return false;
  136. }
  137. stream = std::make_shared<HcclTask::StreamGuard>(rt_model_handle_, new_stream);
  138. if (stream == nullptr) {
  139. GELOGE(FAILED, "MakeShared failed.");
  140. return false;
  141. }
  142. secondary_stream_vec[index] = stream;
  143. }
  144. secondary_stream_list_.push_back(stream);
  145. return true;
  146. };
  147. if (static_cast<size_t>(hccl_secondary_stream_num) <= secondary_stream_vec.size()) {
  148. GELOGI("Number of secondary stream is enough to be reused.");
  149. for (int64_t i = 0; i < hccl_secondary_stream_num; ++i) {
  150. if (!lock_weak_ptr(i)) {
  151. GELOGE(FAILED, "Lock weak ptr failed.");
  152. return false;
  153. }
  154. }
  155. } else {
  156. GELOGI("Need to reuse secondary stream and create new secondary stream.");
  157. size_t created_stream_num = secondary_stream_vec.size();
  158. for (size_t i = 0; i < secondary_stream_vec.size(); ++i) {
  159. if (!lock_weak_ptr(i)) {
  160. GELOGE(FAILED, "Lock weak ptr failed.");
  161. return false;
  162. }
  163. }
  164. ret = CreateStream(hccl_secondary_stream_num - created_stream_num, master_stream_id);
  165. if (ret != SUCCESS) {
  166. GELOGE(RT_FAILED, "Create hccl stream failed.");
  167. return false;
  168. }
  169. }
  170. GELOGI("Initialize hccl secondary stream success, hccl_secondary_stream_num =%ld", hccl_secondary_stream_num);
  171. } else {
  172. GELOGI("Need to create secondary stream for %s with new mainstream %ld.", task_info_->op_name().c_str(),
  173. master_stream_id);
  174. ret = CreateStream(hccl_secondary_stream_num, master_stream_id);
  175. if (!ret) {
  176. GELOGE(RT_FAILED, "Create hccl stream failed.");
  177. return false;
  178. }
  179. }
  180. return true;
  181. }
  182. bool HcclTask::CreateStream(int64_t stream_num, int64_t master_stream_id) {
  183. GELOGI("Start to create %ld hccl secondary stream.", stream_num);
  184. for (int64_t i = 0; i < stream_num; ++i) {
  185. rtStream_t stream = nullptr;
  186. bool ret = CreateStream(rt_model_handle_, &stream);
  187. if (!ret) {
  188. GELOGE(FAILED, "CreateStream failed.");
  189. return false;
  190. }
  191. GELOGD("hccl_stream addr is=%p", stream);
  192. auto shared_stream = std::make_shared<StreamGuard>(rt_model_handle_, stream);
  193. if (shared_stream == nullptr) {
  194. GELOGE(FAILED, "MakeShared failed.");
  195. return false;
  196. }
  197. SaveHcclSecondaryStream(master_stream_id, shared_stream);
  198. secondary_stream_list_.push_back(shared_stream);
  199. }
  200. GELOGI("CreateStream success.");
  201. return true;
  202. }
  203. bool HcclTask::CreateStream(rtModel_t model, rtStream_t *stream) const {
  204. if (stream == nullptr) {
  205. GELOGE(FAILED, "Output param stream is null.");
  206. return false;
  207. }
  208. rtError_t rt_ret = rtStreamCreateWithFlags(stream, priority_, RT_STREAM_PERSISTENT | RT_STREAM_FORCE_COPY);
  209. if (rt_ret != RT_ERROR_NONE) {
  210. GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
  211. return false;
  212. }
  213. // Create secondary stream, inactive by default, activated by hccl
  214. rt_ret = rtModelBindStream(model, *stream, RT_MODEL_WAIT_ACTIVE_STREAM);
  215. if (rt_ret != RT_ERROR_NONE) {
  216. GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
  217. return false;
  218. }
  219. return true;
  220. }
  221. void HcclTask::SaveHcclSecondaryStream(int64_t master_stream_id, const std::shared_ptr<StreamGuard> &stream) {
  222. if (model_stream_mapping_.find(rt_model_handle_) == model_stream_mapping_.end()) {
  223. model_stream_mapping_.emplace(rt_model_handle_, std::map<uint32_t, std::vector<std::weak_ptr<StreamGuard>>>());
  224. }
  225. std::map<uint32_t, std::vector<std::weak_ptr<StreamGuard>>> &master_secondary_stream_map =
  226. model_stream_mapping_.at(rt_model_handle_);
  227. master_secondary_stream_map[master_stream_id].emplace_back(stream);
  228. }
// RAII teardown for a cached secondary stream: unbind from the model, then
// destroy the stream.
// NOTE(review): if rtModelUnbindStream fails we return without calling
// rtStreamDestroy, so the stream handle leaks on that path — confirm whether
// destroying a still-bound stream is legal before changing this ordering.
HcclTask::StreamGuard::~StreamGuard() {
  rtError_t rt_ret = rtModelUnbindStream(model_, stream_);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Unbind stream from model failed!");
    return;
  }
  rt_ret = rtStreamDestroy(stream_);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Destroy stream failed!");
    return;
  }
}
  241. REGISTER_TASK(TaskInfoType::HCCL, HcclTask, HcclTaskInfo);
  242. } // namespace model_runner
  243. } // namespace ge

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示