
hccl_task.cc 10 kB

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "ge_runtime/task/hccl_task.h"

#include <algorithm>

#include "ge_runtime/task/task_factory.h"
#include "common/opskernel/ops_kernel_info_store.h"
#include "common/opskernel/ge_task_info.h"

namespace ge {
namespace model_runner {
// Static bookkeeping shared by all HcclTask instances: per rt model handle, maps a
// master stream id to the secondary streams already created for it. Entries are held
// as weak_ptr so they can be reused while alive and recreated once released.
std::map<rtModel_t, std::map<uint32_t, std::vector<std::weak_ptr<HcclTask::StreamGuard>>>>
    HcclTask::model_stream_mapping_;
std::mutex HcclTask::model_stream_mapping_mutex_;
HcclTask::HcclTask(const ModelContext &model_context, const std::shared_ptr<HcclTaskInfo> &task_info)
    : TaskRepeater<HcclTaskInfo>(model_context, task_info),
      task_info_(task_info),
      stream_(nullptr),
      workspace_mem_(nullptr),
      rt_model_handle_(nullptr),
      priority_(0),
      secondary_stream_list_() {
  if (task_info_ == nullptr) {
    GELOGW("task_info_ is null!");
  }

  priority_ = model_context.priority();
  rt_model_handle_ = model_context.rt_model_handle();

  // Pick the stream this task runs on: a single-stream model uses that stream,
  // otherwise the stream indexed by the task's stream id.
  auto stream_list = model_context.stream_list();
  if (stream_list.size() == 1) {
    stream_ = stream_list[0];
  } else if (stream_list.size() > task_info->stream_id()) {
    stream_ = stream_list[task_info->stream_id()];
  } else {
    GELOGW("Index: %u >= stream_list.size(): %zu.", task_info->stream_id(), stream_list.size());
  }
}
HcclTask::~HcclTask() {
  if (workspace_mem_ != nullptr) {
    rtError_t rt_ret = rtFree(workspace_mem_);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "rtFree workspace_mem_ failed! ret: 0x%X.", rt_ret);
    }
    workspace_mem_ = nullptr;
  }
}
bool HcclTask::Distribute() {
  // Ops kernel info store
  // Get privateDef and opsKernelStorePtr
  GELOGI("Get custom info in modelTaskDef");
  void *ops_kernel_store = task_info_->ops_kernel_store();
  if (ops_kernel_store == nullptr) {
    GELOGE(PARAM_INVALID, "No hcom distribute function ptr and no ops kernel store.");
    return false;
  }
  auto *ops_kernel_info_store = reinterpret_cast<OpsKernelInfoStore *>(ops_kernel_store);

  char *private_def = reinterpret_cast<char *>(const_cast<unsigned char *>(task_info_->private_def().data()));
  auto private_def_len = static_cast<uint32_t>(task_info_->private_def().size());
  GELOGI("The first address of the custom info, privateDef=%p", private_def);

  SetSecondaryStream();

  if (task_info_->workspace_size() > 0) {
    rtError_t rt_ret = rtMalloc(&workspace_mem_, task_info_->workspace_size(), RT_MEMORYINFO_HBM);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
      return false;
    }
  }

  GELOGI("HcclTaskInfo Distribute Start. begin to call function LoadTask in hccl.");
  // Fill in the task description handed over to the HCCL ops kernel info store.
  GETaskInfo ge_task;
  ge_task.id = 0;
  ge_task.type = static_cast<uint16_t>(RT_MODEL_TASK_HCCL);
  ge_task.stream = stream_;

  ge_task.kernelHcclInfo = std::vector<GETaskKernelHcclInfo>(1);
  ge_task.kernelHcclInfo[0].hccl_type = task_info_->hccl_type();
  ge_task.kernelHcclInfo[0].inputDataAddr = task_info_->input_data_addr();
  ge_task.kernelHcclInfo[0].outputDataAddr = task_info_->output_data_addr();
  ge_task.kernelHcclInfo[0].workSpaceAddr = workspace_mem_;
  ge_task.kernelHcclInfo[0].workSpaceMemSize = task_info_->workspace_size();
  ge_task.kernelHcclInfo[0].count = task_info_->count();
  ge_task.kernelHcclInfo[0].dataType = static_cast<int32_t>(task_info_->data_type());
  ge_task.kernelHcclInfo[0].opType = static_cast<int32_t>(task_info_->op_type());
  ge_task.kernelHcclInfo[0].rootId = task_info_->root_id();

  std::vector<rtStream_t> secondary_stream_list;
  std::transform(secondary_stream_list_.begin(), secondary_stream_list_.end(),
                 std::back_inserter(secondary_stream_list),
                 [](const std::shared_ptr<StreamGuard> &stream) -> rtStream_t { return stream->GetStream(); });
  ge_task.kernelHcclInfo[0].hcclStreamList = secondary_stream_list;

  ge_task.privateDef = private_def;
  ge_task.privateDefLen = private_def_len;
  ge_task.opsKernelStorePtr = ops_kernel_store;

  auto result = ops_kernel_info_store->LoadTask(ge_task);
  // tagHcclResult::HCCL_SUCCESS is 0
  if (result != 0) {
    GELOGE(INTERNAL_ERROR, "davinci_model : load task fail, return ret: %u", result);
    return false;
  }

  GELOGI("Call function LoadTask end.");
  return true;
}
bool HcclTask::SetSecondaryStream() {
  const uint32_t master_stream_id = task_info_->stream_id();
  const int64_t hccl_secondary_stream_num = task_info_->hccl_stream_num();
  bool ret;

  std::lock_guard<std::mutex> lock(model_stream_mapping_mutex_);
  // First HCCL task on this model: no secondary streams exist yet, create them all.
  if (model_stream_mapping_.find(rt_model_handle_) == model_stream_mapping_.end()) {
    GELOGI("Need to create map for rt_model_handle_:%p with new mainstream %u.", rt_model_handle_, master_stream_id);
    ret = CreateStream(hccl_secondary_stream_num, master_stream_id);
    if (!ret) {
      GELOGE(RT_FAILED, "Create hccl stream failed.");
      return false;
    }
    return true;
  }

  std::map<uint32_t, std::vector<std::weak_ptr<StreamGuard>>> &master_secondary_stream_map =
      model_stream_mapping_.at(rt_model_handle_);
  auto iter = master_secondary_stream_map.find(master_stream_id);
  if (iter != master_secondary_stream_map.end()) {
    std::vector<std::weak_ptr<StreamGuard>> &secondary_stream_vec = iter->second;
    // Re-acquire an existing secondary stream if it is still alive; otherwise create a
    // replacement and write it back into the weak_ptr slot.
    auto lock_weak_ptr = [&secondary_stream_vec, this](int64_t index) -> bool {
      auto stream = secondary_stream_vec[index].lock();
      if (stream == nullptr) {
        rtStream_t new_stream = nullptr;
        bool ret = CreateStream(rt_model_handle_, &new_stream);
        if (!ret) {
          GELOGE(FAILED, "CreateStream failed.");
          return false;
        }
        stream = std::make_shared<HcclTask::StreamGuard>(rt_model_handle_, new_stream);
        if (stream == nullptr) {
          GELOGE(FAILED, "MakeShared failed.");
          return false;
        }
        secondary_stream_vec[index] = stream;
      }
      secondary_stream_list_.push_back(stream);
      return true;
    };

    if (static_cast<size_t>(hccl_secondary_stream_num) <= secondary_stream_vec.size()) {
      GELOGI("Number of secondary stream is enough to be reused.");
      for (int64_t i = 0; i < hccl_secondary_stream_num; ++i) {
        if (!lock_weak_ptr(i)) {
          GELOGE(FAILED, "Lock weak ptr failed.");
          return false;
        }
      }
    } else {
      GELOGI("Need to reuse secondary stream and create new secondary stream.");
      size_t created_stream_num = secondary_stream_vec.size();
      for (size_t i = 0; i < secondary_stream_vec.size(); ++i) {
        if (!lock_weak_ptr(i)) {
          GELOGE(FAILED, "Lock weak ptr failed.");
          return false;
        }
      }
      ret = CreateStream(hccl_secondary_stream_num - created_stream_num, master_stream_id);
      if (!ret) {
        GELOGE(RT_FAILED, "Create hccl stream failed.");
        return false;
      }
    }
    GELOGI("Initialize hccl secondary stream success, hccl_secondary_stream_num = %ld", hccl_secondary_stream_num);
  } else {
    GELOGI("Need to create secondary stream for %s with new mainstream %u.", task_info_->op_name().c_str(),
           master_stream_id);
    ret = CreateStream(hccl_secondary_stream_num, master_stream_id);
    if (!ret) {
      GELOGE(RT_FAILED, "Create hccl stream failed.");
      return false;
    }
  }
  return true;
}
bool HcclTask::CreateStream(int64_t stream_num, int64_t master_stream_id) {
  GELOGI("Start to create %ld hccl secondary stream.", stream_num);
  for (int64_t i = 0; i < stream_num; ++i) {
    rtStream_t stream = nullptr;
    bool ret = CreateStream(rt_model_handle_, &stream);
    if (!ret) {
      GELOGE(FAILED, "CreateStream failed.");
      return false;
    }
    GELOGD("hccl_stream addr is=%p", stream);

    auto shared_stream = std::make_shared<StreamGuard>(rt_model_handle_, stream);
    if (shared_stream == nullptr) {
      GELOGE(FAILED, "MakeShared failed.");
      return false;
    }
    // Record the new stream in the static mapping so later tasks on the same master
    // stream can reuse it, and keep a strong reference for this task.
    SaveHcclSecondaryStream(master_stream_id, shared_stream);
    secondary_stream_list_.push_back(shared_stream);
  }
  GELOGI("CreateStream success.");
  return true;
}
bool HcclTask::CreateStream(rtModel_t model, rtStream_t *stream) const {
  if (stream == nullptr) {
    GELOGE(FAILED, "Output param stream is null.");
    return false;
  }

  rtError_t rt_ret = rtStreamCreateWithFlags(stream, priority_, RT_STREAM_PERSISTENT | RT_STREAM_FORCE_COPY);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
    return false;
  }
  // Create secondary stream, inactive by default, activated by hccl
  rt_ret = rtModelBindStream(model, *stream, RT_MODEL_WAIT_ACTIVE_STREAM);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
    return false;
  }
  return true;
}
void HcclTask::SaveHcclSecondaryStream(int64_t master_stream_id, const std::shared_ptr<StreamGuard> &stream) {
  if (model_stream_mapping_.find(rt_model_handle_) == model_stream_mapping_.end()) {
    model_stream_mapping_.emplace(rt_model_handle_, std::map<uint32_t, std::vector<std::weak_ptr<StreamGuard>>>());
  }
  std::map<uint32_t, std::vector<std::weak_ptr<StreamGuard>>> &master_secondary_stream_map =
      model_stream_mapping_.at(rt_model_handle_);
  master_secondary_stream_map[master_stream_id].emplace_back(stream);
}
HcclTask::StreamGuard::~StreamGuard() {
  rtError_t rt_ret = rtModelUnbindStream(model_, stream_);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Unbind stream from model failed!");
    return;
  }
  rt_ret = rtStreamDestroy(stream_);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Destroy stream failed!");
    return;
  }
}

REGISTER_TASK(TaskInfoType::HCCL, HcclTask, HcclTaskInfo);

}  // namespace model_runner
}  // namespace ge
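The final line, REGISTER_TASK(TaskInfoType::HCCL, HcclTask, HcclTaskInfo);, hooks HcclTask into ge_runtime's task factory so the model runner can build the task from a HcclTaskInfo without naming the concrete class. The actual expansion lives in task_factory.h and is not shown here; the sketch below is only a minimal, self-contained illustration of the usual self-registration idiom, and the stand-in names (HcclLikeTask, HcclLikeTaskInfo, HcclLikeTaskRegistrar) are assumptions rather than the real ge_runtime API.

#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <utility>

// Stand-in types; the real ModelContext/TaskInfo/Task classes in ge_runtime are richer.
enum class TaskInfoType { HCCL };
struct TaskInfo {
  virtual ~TaskInfo() = default;
};
struct HcclLikeTaskInfo : TaskInfo {
  int stream_id = 0;
};
struct Task {
  virtual ~Task() = default;
  virtual bool Distribute() = 0;
};

// Registry mapping a task-info type to a creator callback.
class TaskFactory {
 public:
  using Creator = std::function<std::shared_ptr<Task>(const std::shared_ptr<TaskInfo> &)>;

  static TaskFactory &Instance() {
    static TaskFactory instance;
    return instance;
  }

  void Register(TaskInfoType type, Creator creator) { creators_[type] = std::move(creator); }

  std::shared_ptr<Task> Create(TaskInfoType type, const std::shared_ptr<TaskInfo> &info) const {
    auto it = creators_.find(type);
    return it == creators_.end() ? nullptr : it->second(info);
  }

 private:
  std::map<TaskInfoType, Creator> creators_;
};

// A task analogous to HcclTask, built from its matching task-info class.
struct HcclLikeTask : Task {
  explicit HcclLikeTask(std::shared_ptr<HcclLikeTaskInfo> info) : info_(std::move(info)) {}
  bool Distribute() override {
    std::printf("distribute hccl-like task on stream %d\n", info_->stream_id);
    return true;
  }
  std::shared_ptr<HcclLikeTaskInfo> info_;
};

// What a REGISTER_TASK-style macro typically boils down to: a static object whose
// constructor registers a creator lambda before main() runs.
struct HcclLikeTaskRegistrar {
  HcclLikeTaskRegistrar() {
    TaskFactory::Instance().Register(TaskInfoType::HCCL, [](const std::shared_ptr<TaskInfo> &info) {
      return std::static_pointer_cast<Task>(
          std::make_shared<HcclLikeTask>(std::static_pointer_cast<HcclLikeTaskInfo>(info)));
    });
  }
};
static HcclLikeTaskRegistrar g_hccl_like_task_registrar;

int main() {
  auto info = std::make_shared<HcclLikeTaskInfo>();
  info->stream_id = 3;
  // The caller only knows the TaskInfoType; the factory picks the concrete class.
  std::shared_ptr<Task> task = TaskFactory::Instance().Create(TaskInfoType::HCCL, info);
  if (task != nullptr) {
    task->Distribute();
  }
  return 0;
}

In a setup like this, the model runner can iterate over generic TaskInfo objects and ask the factory for concrete tasks, which is why hccl_task.cc needs only the single REGISTER_TASK line and no other wiring.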

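Within SetSecondaryStream, the reuse logic rests on the fact that model_stream_mapping_ stores only std::weak_ptr<StreamGuard> entries, while each task keeps std::shared_ptr owners in its secondary_stream_list_: a secondary stream therefore lives exactly as long as at least one task still references it, and a later task either locks and reuses a slot or recreates the stream once the slot has expired. The standalone sketch below isolates that bookkeeping with hypothetical names (FakeStream, StreamPool); it mirrors the pattern, not the actual ge_runtime classes.

#include <cstddef>
#include <cstdio>
#include <map>
#include <memory>
#include <vector>

// Hypothetical stand-in for an RAII-guarded runtime stream (the role StreamGuard plays).
struct FakeStream {
  explicit FakeStream(int id) : id_(id) { std::printf("create stream %d\n", id_); }
  ~FakeStream() { std::printf("destroy stream %d\n", id_); }
  int id_;
};

// Pool keyed by master stream id. It stores weak_ptr only, so it never keeps a stream
// alive by itself; this is the role model_stream_mapping_ plays for each model.
class StreamPool {
 public:
  // Return `num` usable streams: lock slots that are still alive, recreate expired ones.
  std::vector<std::shared_ptr<FakeStream>> Acquire(int master_id, std::size_t num) {
    std::vector<std::shared_ptr<FakeStream>> owned;
    auto &slots = pool_[master_id];
    if (slots.size() < num) {
      slots.resize(num);
    }
    for (std::size_t i = 0; i < num; ++i) {
      std::shared_ptr<FakeStream> stream = slots[i].lock();  // reuse if another task still owns it
      if (stream == nullptr) {
        stream = std::make_shared<FakeStream>(next_id_++);   // slot expired: create a replacement
        slots[i] = stream;
      }
      owned.push_back(stream);
    }
    return owned;
  }

 private:
  std::map<int, std::vector<std::weak_ptr<FakeStream>>> pool_;
  int next_id_ = 0;
};

int main() {
  StreamPool pool;
  {
    auto task1_streams = pool.Acquire(0, 2);  // first task: creates stream 0 and 1
    auto task2_streams = pool.Acquire(0, 2);  // second task, same master id: reuses both
  }  // both owners destroyed -> streams released, only expired weak_ptrs remain
  auto task3_streams = pool.Acquire(0, 2);    // later task: slots expired, streams recreated
  return 0;
}

The same reasoning explains StreamGuard's destructor: once the last owning task releases its shared_ptr, the stream is unbound and destroyed, and the expired weak_ptr slot in model_stream_mapping_ simply waits to be refilled by the next task on that master stream.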
The Graph Engine (GE) is a submodule of MindSpore. Implemented in C++, it sits between the front-end module ME and the underlying hardware, acting as the bridge between the two. GE takes the graph delivered by ME as input, performs a series of deep graph optimizations on it, and finally outputs a graph that can run efficiently on the underlying hardware. GE applies optimizations tailored to the hardware architecture of the Ascend AI processor in order to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE consists mainly of two parts, GE API and GE Core; the detailed architecture diagram is shown below.