
hccl_task.cc 10 kB

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ge_runtime/task/hccl_task.h"
#include <algorithm>
#include "ge_runtime/task/task_factory.h"
#include "common/opskernel/ops_kernel_info_store.h"
#include "common/opskernel/ge_task_info.h"

namespace ge {
namespace model_runner {
std::map<rtModel_t, std::map<uint32_t, std::vector<std::weak_ptr<HcclTask::StreamGuard>>>>
  HcclTask::model_stream_mapping_;
std::mutex HcclTask::model_stream_mapping_mutex_;
HcclTask::HcclTask(const ModelContext &model_context, const std::shared_ptr<HcclTaskInfo> &task_info)
    : TaskRepeater<HcclTaskInfo>(model_context, task_info),
      task_info_(task_info),
      stream_(nullptr),
      workspace_mem_(nullptr),
      rt_model_handle_(nullptr),
      priority_(0),
      secondary_stream_list_() {
  if (task_info_ == nullptr) {
    GELOGW("task_info_ is null!");
    return;
  }

  priority_ = model_context.priority();
  rt_model_handle_ = model_context.rt_model_handle();
  auto stream_list = model_context.stream_list();
  if (stream_list.size() == 1) {
    stream_ = stream_list[0];
  } else if (stream_list.size() > task_info->stream_id()) {
    stream_ = stream_list[task_info->stream_id()];
  } else {
    GELOGW("Index: %u >= stream_list.size(): %zu.", task_info->stream_id(), stream_list.size());
  }
}
HcclTask::~HcclTask() {
  if (workspace_mem_ != nullptr) {
    rtError_t rt_ret = rtFree(workspace_mem_);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "rtFree workspace_mem_ failed! ret: 0x%X.", rt_ret);
    }
    workspace_mem_ = nullptr;
  }
}
bool HcclTask::Distribute() {
  // Ops kernel info store
  // Get privateDef and opsKernelStorePtr
  GELOGI("Get custom info in modelTaskDef");
  void *ops_kernel_store = task_info_->ops_kernel_store();
  OpsKernelInfoStore *ops_kernel_info_store = reinterpret_cast<OpsKernelInfoStore *>(ops_kernel_store);
  if (ops_kernel_store == nullptr) {
    GELOGE(PARAM_INVALID, "No hcom distribute function ptr and no ops kernel store.");
    return false;
  }

  char *private_def = reinterpret_cast<char *>(const_cast<char unsigned *>(task_info_->private_def().data()));
  auto private_def_len = static_cast<uint32_t>(task_info_->private_def().size());
  GELOGI("The first address of the custom info, privateDef=%p", private_def);

  SetSecondaryStream();

  if (task_info_->workspace_size() > 0) {
    rtError_t rt_ret = rtMalloc(&workspace_mem_, task_info_->workspace_size(), RT_MEMORYINFO_HBM);
    if (rt_ret != RT_ERROR_NONE) {
      GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
      return false;
    }
  }

  GELOGI("HcclTaskInfo Distribute Start. begin to call function LoadTask in hccl.");
  GETaskInfo ge_task;
  ge_task.id = 0;
  ge_task.type = static_cast<uint16_t>(RT_MODEL_TASK_HCCL);
  ge_task.stream = stream_;

  ge_task.kernelHcclInfo = std::vector<GETaskKernelHcclInfo>(1);
  ge_task.kernelHcclInfo[0].hccl_type = task_info_->hccl_type();
  ge_task.kernelHcclInfo[0].inputDataAddr = task_info_->input_data_addr();
  ge_task.kernelHcclInfo[0].outputDataAddr = task_info_->output_data_addr();
  ge_task.kernelHcclInfo[0].workSpaceAddr = workspace_mem_;
  ge_task.kernelHcclInfo[0].workSpaceMemSize = task_info_->workspace_size();
  ge_task.kernelHcclInfo[0].count = task_info_->count();
  ge_task.kernelHcclInfo[0].dataType = static_cast<int32_t>(task_info_->data_type());
  ge_task.kernelHcclInfo[0].opType = static_cast<int32_t>(task_info_->op_type());
  ge_task.kernelHcclInfo[0].rootId = task_info_->root_id();

  std::vector<rtStream_t> secondary_stream_list;
  std::transform(secondary_stream_list_.begin(), secondary_stream_list_.end(),
                 std::back_inserter(secondary_stream_list),
                 [](const std::shared_ptr<StreamGuard> &stream) -> rtStream_t { return stream->GetStream(); });
  ge_task.kernelHcclInfo[0].hcclStreamList = secondary_stream_list;

  ge_task.privateDef = private_def;
  ge_task.privateDefLen = private_def_len;
  ge_task.opsKernelStorePtr = ops_kernel_store;

  auto result = ops_kernel_info_store->LoadTask(ge_task);
  // tagHcclResult::HCCL_SUCCESS is 0
  if (result != 0) {
    GELOGE(INTERNAL_ERROR, "davinci_model : load task fail, return ret: %u", result);
    return false;
  }

  GELOGI("Call function LoadTask end.");
  return true;
}
bool HcclTask::SetSecondaryStream() {
  const uint32_t master_stream_id = task_info_->stream_id();
  const int64_t hccl_secondary_stream_num = task_info_->hccl_stream_num();
  bool ret;
  std::lock_guard<std::mutex> lock(model_stream_mapping_mutex_);

  // No secondary streams recorded for this model yet: create all of them.
  if (model_stream_mapping_.find(rt_model_handle_) == model_stream_mapping_.end()) {
    GELOGI("Need to create map for rt_model_handle_:%p with new mainstream %u.", rt_model_handle_, master_stream_id);
    ret = CreateStream(hccl_secondary_stream_num, master_stream_id);
    if (!ret) {
      GELOGE(RT_FAILED, "Create hccl stream failed.");
      return false;
    }
    return true;
  }

  std::map<uint32_t, std::vector<std::weak_ptr<StreamGuard>>> &master_secondary_stream_map =
    model_stream_mapping_.at(rt_model_handle_);
  auto iter = master_secondary_stream_map.find(master_stream_id);
  if (iter != master_secondary_stream_map.end()) {
    std::vector<std::weak_ptr<StreamGuard>> &secondary_stream_vec = iter->second;
    // Re-lock a cached secondary stream; recreate it if the cached weak_ptr has expired.
    auto lock_weak_ptr = [&secondary_stream_vec, this](int64_t index) -> bool {
      auto stream = secondary_stream_vec[index].lock();
      if (stream == nullptr) {
        rtStream_t new_stream = nullptr;
        bool ret = CreateStream(rt_model_handle_, &new_stream);
        if (!ret) {
          GELOGE(FAILED, "CreateStream failed.");
          return false;
        }
        stream = std::make_shared<HcclTask::StreamGuard>(rt_model_handle_, new_stream);
        if (stream == nullptr) {
          GELOGE(FAILED, "MakeShared failed.");
          return false;
        }
        secondary_stream_vec[index] = stream;
      }
      secondary_stream_list_.push_back(stream);
      return true;
    };

    if (static_cast<size_t>(hccl_secondary_stream_num) <= secondary_stream_vec.size()) {
      GELOGI("Number of secondary stream is enough to be reused.");
      for (int64_t i = 0; i < hccl_secondary_stream_num; ++i) {
        if (!lock_weak_ptr(i)) {
          GELOGE(FAILED, "Lock weak ptr failed.");
          return false;
        }
      }
    } else {
      GELOGI("Need to reuse secondary stream and create new secondary stream.");
      size_t created_stream_num = secondary_stream_vec.size();
      for (size_t i = 0; i < secondary_stream_vec.size(); ++i) {
        if (!lock_weak_ptr(i)) {
          GELOGE(FAILED, "Lock weak ptr failed.");
          return false;
        }
      }
      ret = CreateStream(hccl_secondary_stream_num - created_stream_num, master_stream_id);
      if (!ret) {
        GELOGE(RT_FAILED, "Create hccl stream failed.");
        return false;
      }
    }
    GELOGI("Initialize hccl secondary stream success, hccl_secondary_stream_num =%ld", hccl_secondary_stream_num);
  } else {
    GELOGI("Need to create secondary stream for %s with new mainstream %u.", task_info_->op_name().c_str(),
           master_stream_id);
    ret = CreateStream(hccl_secondary_stream_num, master_stream_id);
    if (!ret) {
      GELOGE(RT_FAILED, "Create hccl stream failed.");
      return false;
    }
  }
  return true;
}
bool HcclTask::CreateStream(int64_t stream_num, int64_t master_stream_id) {
  GELOGI("Start to create %ld hccl secondary stream.", stream_num);
  for (int64_t i = 0; i < stream_num; ++i) {
    rtStream_t stream = nullptr;
    bool ret = CreateStream(rt_model_handle_, &stream);
    if (!ret) {
      GELOGE(FAILED, "CreateStream failed.");
      return false;
    }
    GELOGD("hccl_stream addr is=%p", stream);
    auto shared_stream = std::make_shared<StreamGuard>(rt_model_handle_, stream);
    if (shared_stream == nullptr) {
      GELOGE(FAILED, "MakeShared failed.");
      return false;
    }
    SaveHcclSecondaryStream(master_stream_id, shared_stream);
    secondary_stream_list_.push_back(shared_stream);
  }
  GELOGI("CreateStream success.");
  return true;
}
bool HcclTask::CreateStream(rtModel_t model, rtStream_t *stream) const {
  if (stream == nullptr) {
    GELOGE(FAILED, "Output param stream is null.");
    return false;
  }

  rtError_t rt_ret = rtStreamCreateWithFlags(stream, priority_, RT_STREAM_PERSISTENT | RT_STREAM_FORCE_COPY);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
    return false;
  }
  // Create secondary stream, inactive by default, activated by hccl
  rt_ret = rtModelBindStream(model, *stream, RT_MODEL_WAIT_ACTIVE_STREAM);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
    return false;
  }
  return true;
}
void HcclTask::SaveHcclSecondaryStream(int64_t master_stream_id, const std::shared_ptr<StreamGuard> &stream) {
  if (model_stream_mapping_.find(rt_model_handle_) == model_stream_mapping_.end()) {
    model_stream_mapping_.emplace(rt_model_handle_, std::map<uint32_t, std::vector<std::weak_ptr<StreamGuard>>>());
  }
  std::map<uint32_t, std::vector<std::weak_ptr<StreamGuard>>> &master_secondary_stream_map =
    model_stream_mapping_.at(rt_model_handle_);
  master_secondary_stream_map[master_stream_id].emplace_back(stream);
}
HcclTask::StreamGuard::~StreamGuard() {
  rtError_t rt_ret = rtModelUnbindStream(model_, stream_);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Unbind stream from model failed!");
    return;
  }

  rt_ret = rtStreamDestroy(stream_);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_FAILED, "Destroy stream failed!");
    return;
  }
}

REGISTER_TASK(TaskInfoType::HCCL, HcclTask, HcclTaskInfo);
}  // namespace model_runner
}  // namespace ge
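
The core idea behind SetSecondaryStream and SaveHcclSecondaryStream above is a weak_ptr cache: a static map keyed by model handle and master stream id stores weak references to RAII stream guards, so several HCCL tasks can share the same secondary streams, streams are destroyed as soon as the last task holding a shared_ptr goes away, and expired cache entries are transparently recreated. The following is a minimal, self-contained sketch of that pattern using only the standard library; the names FakeStream and StreamCache are hypothetical stand-ins for StreamGuard and the static mapping, not part of the GE code.

// stream_cache_sketch.cc - illustrates the weak_ptr caching/reuse pattern (assumption-labeled names)
#include <cstdio>
#include <map>
#include <memory>
#include <mutex>
#include <vector>

struct FakeStream {  // stands in for a StreamGuard-style RAII wrapper
  explicit FakeStream(int id) : id_(id) { std::printf("create stream %d\n", id_); }
  ~FakeStream() { std::printf("destroy stream %d\n", id_); }
  int id_;
};

class StreamCache {
 public:
  // Return a shared stream for (owner, index): reuse the cached one while it is
  // still alive, rebuild it when the cached weak_ptr has expired.
  static std::shared_ptr<FakeStream> Get(int owner, size_t index) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto &vec = cache_[owner];
    if (vec.size() <= index) {
      vec.resize(index + 1);
    }
    auto stream = vec[index].lock();
    if (stream == nullptr) {  // expired or never created: make a new one
      stream = std::make_shared<FakeStream>(static_cast<int>(index));
      vec[index] = stream;    // cache only a weak reference
    }
    return stream;
  }

 private:
  static std::map<int, std::vector<std::weak_ptr<FakeStream>>> cache_;
  static std::mutex mutex_;
};

std::map<int, std::vector<std::weak_ptr<FakeStream>>> StreamCache::cache_;
std::mutex StreamCache::mutex_;

int main() {
  auto a = StreamCache::Get(0, 0);  // creates stream 0
  auto b = StreamCache::Get(0, 0);  // reuses the live stream, no new creation
  a.reset();
  b.reset();                        // last holder gone -> stream destroyed
  auto c = StreamCache::Get(0, 0);  // cache entry expired -> recreated
  return 0;
}

In hccl_task.cc the same role is played by model_stream_mapping_ (the static map), StreamGuard (the RAII wrapper that unbinds and destroys the runtime stream), and the lock_weak_ptr lambda (the lock-or-recreate step).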

The Graph Engine (GE) module is a submodule of MindSpore, implemented in C++. It sits between the front-end module (ME) and the underlying hardware and serves as the bridge between them. GE takes the graph issued by ME as input, applies a series of deep graph optimizations, and finally outputs a graph that can run efficiently on the underlying hardware. GE performs optimizations tailored to the hardware architecture of the Ascend AI processor in order to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE mainly consists of two parts, GE API and GE Core; the detailed architecture is shown in the diagram below.