You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

exception_dumper.cc 10 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242
  1. /**
  2. * Copyright 2019-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "common/dump/exception_dumper.h"
  17. #include "common/ge/datatype_util.h"
  18. #include "common/debug/memory_dumper.h"
  19. #include "framework/common/debug/log.h"
  20. #include "graph/manager/util/debug.h"
  21. #include "graph/utils/tensor_utils.h"
  22. #include "graph/load/model_manager/model_utils.h"
  23. #include "proto/dump_task.pb.h"
  24. namespace {
  25. static uint64_t GetNowTime() {
  26. uint64_t ret = 0;
  27. mmTimeval tv;
  28. if (mmGetTimeOfDay(&tv, nullptr) == 0) {
  29. ret = tv.tv_sec * 1000000ULL + tv.tv_usec;
  30. }
  31. return ret;
  32. }
  // Rewrites `str` in place: every space, dot, forward slash and backslash becomes an underscore.
  // All other characters are left untouched.
  static void ReplaceStringElem(std::string &str) {
    for (char &current : str) {
      switch (current) {
        case ' ':
        case '.':
        case '/':
        case '\\':
          current = '_';
          break;
        default:
          break;
      }
    }
  }
  40. static void SetDumpData(const ge::OpDescInfo &op_desc_info, toolkit::dump::DumpData &dump_data) {
  41. dump_data.set_version("2.0");
  42. dump_data.set_dump_time(GetNowTime());
  43. dump_data.set_op_name(op_desc_info.op_name);
  44. for (size_t i = 0; i < op_desc_info.input_format.size(); ++i) {
  45. toolkit::dump::OpInput input;
  46. input.set_data_type(toolkit::dump::OutputDataType(
  47. ge::DataTypeUtil::GetIrDataType(op_desc_info.input_data_type[i])));
  48. input.set_format(toolkit::dump::OutputFormat(op_desc_info.input_format[i]));
  49. for (auto dim : op_desc_info.input_shape[i]) {
  50. input.mutable_shape()->add_dim(dim);
  51. }
  52. input.set_size(op_desc_info.input_size[i]);
  53. GELOGI("[Set][DumpData] The input size int exception is %ld", op_desc_info.input_size[i]);
  54. dump_data.mutable_input()->Add(std::move(input));
  55. }
  56. for (size_t j = 0; j < op_desc_info.output_format.size(); ++j) {
  57. toolkit::dump::OpOutput output;
  58. output.set_data_type(toolkit::dump::OutputDataType(
  59. ge::DataTypeUtil::GetIrDataType(op_desc_info.output_data_type[j])));
  60. output.set_format(toolkit::dump::OutputFormat(op_desc_info.output_format[j]));
  61. for (auto dim : op_desc_info.output_shape[j]) {
  62. output.mutable_shape()->add_dim(dim);
  63. }
  64. output.set_size(op_desc_info.output_size[j]);
  65. GELOGI("[Set][DumpData] The output size int exception is %ld", op_desc_info.output_size[j]);
  66. dump_data.mutable_output()->Add(std::move(output));
  67. }
  68. }
  69. } // namespace
  70. namespace ge {
  71. ExceptionDumper::~ExceptionDumper() {}
  72. void ExceptionDumper::SaveDumpOpInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id,
  73. vector<void *> &input_addrs, vector<void *> &output_addrs) {
  74. OpDescInfo op_desc_info;
  75. SaveOpDescInfo(op, task_id, stream_id, op_desc_info);
  76. op_desc_info.input_addrs = input_addrs;
  77. op_desc_info.output_addrs = output_addrs;
  78. op_desc_info_.emplace_back(std::move(op_desc_info));
  79. }
  80. void ExceptionDumper::SaveDumpOpInfo(const RuntimeParam &model_param, const OpDescPtr &op,
  81. uint32_t task_id, uint32_t stream_id) {
  82. OpDescInfo op_desc_info;
  83. SaveOpDescInfo(op, task_id, stream_id, op_desc_info);
  84. op_desc_info.input_addrs = ModelUtils::GetInputDataAddrs(model_param, op);
  85. op_desc_info.output_addrs = ModelUtils::GetOutputDataAddrs(model_param, op);
  86. op_desc_info_.emplace_back(std::move(op_desc_info));
  87. }
  88. void ExceptionDumper::SaveOpDescInfo(const OpDescPtr &op, uint32_t task_id, uint32_t stream_id,
  89. OpDescInfo &op_desc_info) {
  90. if (op == nullptr) {
  91. GELOGW("[Save][OpExceptionInfo] op desc ptr is null.");
  92. return;
  93. }
  94. GELOGD("[Save][OpExceptionInfo] Start to save dump op [%s] info of task_id: %u, stream_id: %u",
  95. op->GetName().c_str(), task_id, stream_id);
  96. op_desc_info.op_name = op->GetName();
  97. op_desc_info.op_type = op->GetType();
  98. op_desc_info.task_id = task_id;
  99. op_desc_info.stream_id = stream_id;
  100. for (size_t i = 0; i < op->GetAllInputsSize(); ++i) {
  101. GeTensorDescPtr input_tensor_desc = op->MutableInputDesc(i);
  102. if (input_tensor_desc == nullptr) {
  103. continue;
  104. }
  105. op_desc_info.input_format.emplace_back(input_tensor_desc->GetFormat());
  106. op_desc_info.input_shape.emplace_back(input_tensor_desc->GetShape().GetDims());
  107. op_desc_info.input_data_type.emplace_back(input_tensor_desc->GetDataType());
  108. int64_t input_size = 0;
  109. if (TensorUtils::GetTensorSizeInBytes(*input_tensor_desc, input_size) != SUCCESS) {
  110. GELOGW("[Save][OpExceptionInfo] Op [%s] get input size failed.", op->GetName().c_str());
  111. return;
  112. }
  113. GELOGD("[Save][OpExceptionInfo] Save dump op info, the input size is %ld", input_size);
  114. op_desc_info.input_size.emplace_back(input_size);
  115. }
  116. for (size_t j = 0; j < op->GetOutputsSize(); ++j) {
  117. GeTensorDescPtr output_tensor_desc = op->MutableOutputDesc(j);
  118. if (output_tensor_desc == nullptr) {
  119. continue;
  120. }
  121. op_desc_info.output_format.emplace_back(output_tensor_desc->GetFormat());
  122. op_desc_info.output_shape.emplace_back(output_tensor_desc->GetShape().GetDims());
  123. op_desc_info.output_data_type.emplace_back(output_tensor_desc->GetDataType());
  124. int64_t output_size = 0;
  125. if (TensorUtils::GetTensorSizeInBytes(*output_tensor_desc, output_size) != SUCCESS) {
  126. GELOGW("[Save][OpExceptionInfo] Op [%s] get output size failed.", op->GetName().c_str());
  127. return;
  128. }
  129. GELOGD("[Save][OpExceptionInfo] Save dump op info, the output size is %ld.", output_size);
  130. op_desc_info.output_size.emplace_back(output_size);
  131. }
  132. }
  133. Status ExceptionDumper::DumpExceptionInfo(const std::vector<rtExceptionInfo> &exception_infos) const {
  134. GELOGI("[Dump][Exception] Start to dump exception info");
  135. for (const rtExceptionInfo &iter : exception_infos) {
  136. OpDescInfo op_desc_info;
  137. if (GetOpDescInfo(iter.streamid, iter.taskid, op_desc_info)) {
  138. toolkit::dump::DumpData dump_data;
  139. SetDumpData(op_desc_info, dump_data);
  140. uint64_t now_time = GetNowTime();
  141. std::string op_name = op_desc_info.op_name;
  142. std::string op_type = op_desc_info.op_type;
  143. ReplaceStringElem(op_name);
  144. ReplaceStringElem(op_type);
  145. string dump_file_path =
  146. "./" + op_type + "." + op_name + "." + std::to_string(op_desc_info.task_id) + "." + std::to_string(now_time);
  147. GELOGI("[Dump][Exception] The exception dump file path is %s", dump_file_path.c_str());
  148. uint64_t proto_size = dump_data.ByteSizeLong();
  149. std::unique_ptr<char[]> proto_msg(new (std::nothrow) char[proto_size]);
  150. GE_CHECK_NOTNULL(proto_msg);
  151. bool ret = dump_data.SerializeToArray(proto_msg.get(), proto_size);
  152. if (!ret || proto_size == 0) {
  153. REPORT_INNER_ERROR("E19999", "Serialize proto to string fail");
  154. GELOGE(PARAM_INVALID, "[Dump][Exception] Dump data proto serialize failed");
  155. return PARAM_INVALID;
  156. }
  157. GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), &proto_size, sizeof(uint64_t)),
  158. "Failed to dump proto size");
  159. GE_CHK_STATUS_RET(MemoryDumper::DumpToFile(dump_file_path.c_str(), proto_msg.get(), proto_size),
  160. "Failed to dump proto msg");
  161. if (DumpExceptionInput(op_desc_info, dump_file_path) != SUCCESS) {
  162. GELOGE(PARAM_INVALID, "[Dump][Exception] Dump exception input failed");
  163. return PARAM_INVALID;
  164. }
  165. if (DumpExceptionOutput(op_desc_info, dump_file_path) != SUCCESS) {
  166. GELOGE(PARAM_INVALID, "[Dump][Exception] Dump exception output failed");
  167. return PARAM_INVALID;
  168. }
  169. GELOGI("[Dump][Exception] Dump exception info SUCCESS");
  170. } else {
  171. GELOGE(PARAM_INVALID, "[Dump][Exception] Get op desc info failed,task id:%u,stream id:%u",
  172. iter.taskid, iter.streamid);
  173. return PARAM_INVALID;
  174. }
  175. }
  176. return SUCCESS;
  177. }
  178. bool ExceptionDumper::GetOpDescInfo(uint32_t stream_id, uint32_t task_id, OpDescInfo &op_desc_info) const {
  179. GELOGI("[Get][OpDescInfo] There are %zu op need to dump.", op_desc_info_.size());
  180. for (size_t index = 0; index < op_desc_info_.size(); ++index) {
  181. OpDescInfo dump_op_info = op_desc_info_.at(index);
  182. if (dump_op_info.task_id == task_id && dump_op_info.stream_id == stream_id) {
  183. GELOGI("[Get][OpDescInfo] Find exception op [%s] of task_id: %u, stream_id: %u.",
  184. dump_op_info.op_name.c_str(), task_id, stream_id);
  185. op_desc_info = dump_op_info;
  186. return true;
  187. }
  188. }
  189. return false;
  190. }
  191. Status ExceptionDumper::DumpExceptionInput(const OpDescInfo &op_desc_info, const string &dump_file) const {
  192. GELOGI("[Dump][ExceptionInput] Start to dump exception input");
  193. for (size_t i = 0; i < op_desc_info.input_addrs.size(); i++) {
  194. if (Debug::DumpDevMem(dump_file.data(), op_desc_info.input_addrs.at(i), op_desc_info.input_size.at(i)) != SUCCESS) {
  195. GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] Dump the %zu input data of op [%s] failed",
  196. i, op_desc_info.op_name.c_str());
  197. return PARAM_INVALID;
  198. }
  199. }
  200. return SUCCESS;
  201. }
  202. Status ExceptionDumper::DumpExceptionOutput(const OpDescInfo &op_desc_info, const string &dump_file) const {
  203. GELOGI("[Dump][ExceptionOutput] Start to dump exception output");
  204. for (size_t i = 0; i < op_desc_info.output_addrs.size(); i++) {
  205. if (Debug::DumpDevMem(dump_file.data(), op_desc_info.output_addrs.at(i), op_desc_info.output_size.at(i)) !=
  206. SUCCESS) {
  207. GELOGE(PARAM_INVALID, "[Dump][ExceptionInput] Dump the %zu input data of op [%s] failed",
  208. i, op_desc_info.op_name.c_str());
  209. return PARAM_INVALID;
  210. }
  211. }
  212. return SUCCESS;
  213. }
  214. OpDescInfo *ExceptionDumper::MutableOpDescInfo(uint32_t task_id, uint32_t stream_id) {
  215. for (OpDescInfo &op_desc_info : op_desc_info_) {
  216. if (op_desc_info.task_id == task_id && op_desc_info.stream_id == stream_id) {
  217. return &op_desc_info;
  218. }
  219. }
  220. return nullptr;
  221. }
  222. } // namespace ge

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示