You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

dump_op.cc 9.4 kB

4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "common/dump/dump_op.h"
  17. #include "common/dump/dump_manager.h"
  18. #include "common/ge/datatype_util.h"
  19. #include "framework/common/debug/ge_log.h"
  20. #include "framework/common/util.h"
  21. #include "graph/anchor.h"
  22. #include "graph/ge_tensor.h"
  23. #include "graph/op_desc.h"
  24. #include "graph/utils/tensor_utils.h"
  25. #include "proto/ge_ir.pb.h"
  26. #include "proto/op_mapping_info.pb.h"
  27. #include "runtime/mem.h"
  28. #include "aicpu/common/aicpu_task_struct.h"
namespace {
// Value passed to OpMappingInfo::set_flag when loading dump info to AICPU.
const uint32_t kAicpuLoadFlag = 1;
// Dump-mode strings compared against DumpProperties::GetDumpMode():
// dump only outputs, only inputs, or both.
const char *const kDumpOutput = "output";
const char *const kDumpInput = "input";
const char *const kDumpAll = "all";
// Kernel name used when launching the AICPU dump op via rtCpuKernelLaunch.
const char *const kDumpKernelsDumpOp = "DumpDataInfo";
}  // namespace
  36. namespace ge {
  37. DumpOp::~DumpOp() {
  38. if (proto_dev_mem_ != nullptr) {
  39. (void)rtFree(proto_dev_mem_);
  40. }
  41. if (proto_size_dev_mem_ != nullptr) {
  42. (void)rtFree(proto_size_dev_mem_);
  43. }
  44. proto_dev_mem_ = nullptr;
  45. proto_size_dev_mem_ = nullptr;
  46. }
  47. void DumpOp::SetLoopAddr(void *global_step, void *loop_per_iter, void *loop_cond) {
  48. global_step_ = reinterpret_cast<uintptr_t>(global_step);
  49. loop_per_iter_ = reinterpret_cast<uintptr_t>(loop_per_iter);
  50. loop_cond_ = reinterpret_cast<uintptr_t>(loop_cond);
  51. }
  52. void DumpOp::SetDynamicModelInfo(const string &dynamic_model_name, uint32_t dynamic_model_id) {
  53. dynamic_model_name_ = dynamic_model_name;
  54. dynamic_model_id_ = dynamic_model_id;
  55. }
  56. static void SetOpMappingLoopAddr(uintptr_t step_id, uintptr_t loop_per_iter, uintptr_t loop_cond,
  57. aicpu::dump::OpMappingInfo &op_mapping_info) {
  58. if (step_id != 0) {
  59. GELOGI("step_id exists.");
  60. op_mapping_info.set_step_id_addr(static_cast<uint64_t>(step_id));
  61. } else {
  62. GELOGI("step_id is null.");
  63. }
  64. if (loop_per_iter != 0) {
  65. GELOGI("loop_per_iter exists.");
  66. op_mapping_info.set_iterations_per_loop_addr(static_cast<uint64_t>(loop_per_iter));
  67. } else {
  68. GELOGI("loop_per_iter is null.");
  69. }
  70. if (loop_cond != 0) {
  71. GELOGI("loop_cond exists.");
  72. op_mapping_info.set_loop_cond_addr(static_cast<uint64_t>(loop_cond));
  73. } else {
  74. GELOGI("loop_cond is null.");
  75. }
  76. }
  77. Status DumpOp::DumpOutput(aicpu::dump::Task &task) {
  78. GELOGI("Start dump output in Launch dump op");
  79. const auto &output_descs = op_desc_->GetAllOutputsDesc();
  80. for (size_t i = 0; i < output_descs.size(); ++i) {
  81. aicpu::dump::Output output;
  82. output.set_data_type(static_cast<int32_t>(DataTypeUtil::GetIrDataType(output_descs.at(i).GetDataType())));
  83. output.set_format(static_cast<int32_t>(output_descs.at(i).GetFormat()));
  84. for (auto dim : output_descs.at(i).GetShape().GetDims()) {
  85. output.mutable_shape()->add_dim(dim);
  86. }
  87. int64_t output_size = 0;
  88. if (TensorUtils::GetTensorSizeInBytes(output_descs.at(i), output_size) != SUCCESS) {
  89. GELOGE(PARAM_INVALID, "Get output size filed");
  90. return PARAM_INVALID;
  91. }
  92. GELOGD("Get output size in lanch dump op is %ld", output_size);
  93. output.set_size(output_size);
  94. output.set_address(static_cast<uint64_t>(output_addrs_[i]));
  95. task.mutable_output()->Add(std::move(output));
  96. }
  97. return SUCCESS;
  98. }
  99. Status DumpOp::DumpInput(aicpu::dump::Task &task) {
  100. GELOGI("Start dump input in Launch dump op");
  101. const auto &input_descs = op_desc_->GetAllInputsDesc();
  102. for (size_t i = 0; i < input_descs.size(); ++i) {
  103. aicpu::dump::Input input;
  104. input.set_data_type(static_cast<int32_t>(DataTypeUtil::GetIrDataType(input_descs.at(i).GetDataType())));
  105. input.set_format(static_cast<int32_t>(input_descs.at(i).GetFormat()));
  106. for (auto dim : input_descs.at(i).GetShape().GetDims()) {
  107. input.mutable_shape()->add_dim(dim);
  108. }
  109. int64_t input_size = 0;
  110. if (TensorUtils::GetTensorSizeInBytes(input_descs.at(i), input_size) != SUCCESS) {
  111. GELOGE(PARAM_INVALID, "Get output size filed");
  112. return PARAM_INVALID;
  113. }
  114. GELOGD("Get input size in lanch dump op is %ld", input_size);
  115. input.set_size(input_size);
  116. input.set_address(static_cast<uint64_t>(input_addrs_[i]));
  117. task.mutable_input()->Add(std::move(input));
  118. }
  119. return SUCCESS;
  120. }
  121. void DumpOp::SetDumpInfo(const DumpProperties &dump_properties, const OpDescPtr &op_desc, vector<uintptr_t> input_addrs,
  122. vector<uintptr_t> output_addrs, rtStream_t stream) {
  123. dump_properties_ = dump_properties;
  124. op_desc_ = op_desc;
  125. input_addrs_ = input_addrs;
  126. output_addrs_ = output_addrs;
  127. stream_ = stream;
  128. }
  129. Status DumpOp::ExecutorDumpOp(aicpu::dump::OpMappingInfo &op_mapping_info) {
  130. std::string proto_msg;
  131. size_t proto_size = op_mapping_info.ByteSizeLong();
  132. bool ret = op_mapping_info.SerializeToString(&proto_msg);
  133. if (!ret || proto_size == 0) {
  134. GELOGE(FAILED, "Protobuf serialize failed,proto_size is %zu", proto_size);
  135. return FAILED;
  136. }
  137. rtError_t rt_ret = rtMalloc(&proto_dev_mem_, proto_size, RT_MEMORY_HBM);
  138. if (rt_ret != RT_ERROR_NONE) {
  139. GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
  140. return RT_FAILED;
  141. }
  142. rt_ret = rtMemcpy(proto_dev_mem_, proto_size, proto_msg.c_str(), proto_size, RT_MEMCPY_HOST_TO_DEVICE);
  143. if (rt_ret != RT_ERROR_NONE) {
  144. GELOGE(RT_FAILED, "Call rtMemcpy failed, ret: 0x%X", rt_ret);
  145. return RT_FAILED;
  146. }
  147. rt_ret = rtMalloc(&proto_size_dev_mem_, sizeof(size_t), RT_MEMORY_HBM);
  148. if (rt_ret != RT_ERROR_NONE) {
  149. GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
  150. return RT_FAILED;
  151. }
  152. rt_ret = rtMemcpy(proto_size_dev_mem_, sizeof(size_t), &proto_size, sizeof(size_t), RT_MEMCPY_HOST_TO_DEVICE);
  153. if (rt_ret != RT_ERROR_NONE) {
  154. GELOGE(RT_FAILED, "Call rtMemcpy failed, ret: 0x%X", rt_ret);
  155. return RT_FAILED;
  156. }
  157. constexpr int32_t ioAddrNum = 2;
  158. constexpr uint32_t argsSize = sizeof(aicpu::AicpuParamHead) + ioAddrNum * sizeof(uint64_t);
  159. char args[argsSize] = {0};
  160. auto paramHead = reinterpret_cast<aicpu::AicpuParamHead *>(args);
  161. paramHead->length = argsSize;
  162. paramHead->ioAddrNum = ioAddrNum;
  163. auto ioAddr = reinterpret_cast<uint64_t *>(args + sizeof(aicpu::AicpuParamHead));
  164. ioAddr[0] = reinterpret_cast<uintptr_t>(proto_dev_mem_);
  165. ioAddr[1] = reinterpret_cast<uintptr_t>(proto_size_dev_mem_);
  166. rt_ret = rtCpuKernelLaunch(nullptr, kDumpKernelsDumpOp,
  167. 1, // blockDim default 1
  168. args, argsSize,
  169. nullptr, // no need smDesc
  170. stream_);
  171. if (rt_ret != RT_ERROR_NONE) {
  172. GELOGE(RT_FAILED, "Call rtCpuKernelLaunch failed,rt_ret:0x%X", rt_ret);
  173. return rt_ret;
  174. }
  175. GELOGI("Kernel launch dump op success");
  176. return SUCCESS;
  177. }
// Builds an aicpu::dump::OpMappingInfo for op_desc_ according to the
// configured dump mode ("output"/"input"/"all") and hands it to
// ExecutorDumpOp, which launches the AICPU dump kernel.
// Returns SUCCESS, or RT_FAILED/FAILED/the executor's status on error.
Status DumpOp::LaunchDumpOp() {
  GELOGI("Start to launch dump op %s", op_desc_->GetName().c_str());
  // The dump path is suffixed with the current device id, so the device
  // must be resolvable before the mapping info is built.
  int32_t device_id = 0;
  rtError_t rt_ret = rtGetDevice(&device_id);
  if (rt_ret != RT_ERROR_NONE || device_id < 0) {
    GELOGE(RT_FAILED, "Call rtGetDevice failed, ret = 0x%X, device_id = %d.", rt_ret, device_id);
    return RT_FAILED;
  }
  aicpu::dump::OpMappingInfo op_mapping_info;
  auto dump_path = dump_properties_.GetDumpPath() + std::to_string(device_id) + "/";
  op_mapping_info.set_dump_path(dump_path);
  op_mapping_info.set_flag(kAicpuLoadFlag);
  op_mapping_info.set_dump_step(dump_properties_.GetDumpStep());
  // Model identity is only attached for dynamic models
  // (populated via SetDynamicModelInfo).
  if (!dynamic_model_name_.empty()) {
    op_mapping_info.set_model_name(dynamic_model_name_);
    op_mapping_info.set_model_id(dynamic_model_id_);
  }
  // Forward the loop-control addresses recorded by SetLoopAddr (zeros are
  // skipped inside the helper).
  SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info);
  GELOGI("Dump step is %s ,dump path is %s ,in Launch dump op", dump_properties_.GetDumpStep().c_str(),
         dump_path.c_str());
  aicpu::dump::Task task;
  task.mutable_op()->set_op_name(op_desc_->GetName());
  task.mutable_op()->set_op_type(op_desc_->GetType());
  // The three mode branches are mutually exclusive (GetDumpMode yields one
  // string), so `task` is filled and moved into op_mapping_info at most once.
  if (dump_properties_.GetDumpMode() == kDumpOutput) {
    if (DumpOutput(task) != SUCCESS) {
      GELOGE(FAILED, "Dump output failed");
      return FAILED;
    }
    op_mapping_info.mutable_task()->Add(std::move(task));
  }
  if (dump_properties_.GetDumpMode() == kDumpInput) {
    if (DumpInput(task) != SUCCESS) {
      GELOGE(FAILED, "Dump input failed");
      return FAILED;
    }
    op_mapping_info.mutable_task()->Add(std::move(task));
  }
  if (dump_properties_.GetDumpMode() == kDumpAll) {
    // "all" dumps outputs and inputs into the same task entry.
    auto ret = DumpOutput(task);
    if (ret != SUCCESS) {
      GELOGE(FAILED, "Dump output failed when in dumping all");
      return FAILED;
    }
    ret = DumpInput(task);
    if (ret != SUCCESS) {
      GELOGE(FAILED, "Dump input failed when in dumping all");
      return FAILED;
    }
    op_mapping_info.mutable_task()->Add(std::move(task));
  }
  // Serialize the mapping info and launch the AICPU dump kernel.
  auto ret = ExecutorDumpOp(op_mapping_info);
  if (ret != SUCCESS) {
    GELOGE(ret, "Executor dump op failed");
    return ret;
  }
  return SUCCESS;
}
}  // namespace ge

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示