
dump_op.cc 14 kB

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "common/dump/dump_op.h"
#include "common/dump/dump_manager.h"
#include "common/ge/datatype_util.h"
#include "framework/common/debug/ge_log.h"
#include "framework/common/util.h"
#include "framework/common/types.h"
#include "graph/anchor.h"
#include "graph/ge_tensor.h"
#include "graph/op_desc.h"
#include "graph/utils/tensor_utils.h"
#include "proto/ge_ir.pb.h"
#include "proto/op_mapping.pb.h"
#include "runtime/mem.h"
#include "aicpu/common/aicpu_task_struct.h"
namespace {
const uint32_t kAicpuLoadFlag = 1;
const char *const kDumpOutput = "output";
const char *const kDumpInput = "input";
const char *const kDumpAll = "all";
const char *const kDumpKernelsDumpOp = "DumpDataInfo";
}  // namespace
namespace ge {
DumpOp::~DumpOp() {
  if (proto_dev_mem_ != nullptr) {
    (void)rtFree(proto_dev_mem_);
  }
  if (proto_size_dev_mem_ != nullptr) {
    (void)rtFree(proto_size_dev_mem_);
  }
  proto_dev_mem_ = nullptr;
  proto_size_dev_mem_ = nullptr;
}
void DumpOp::SetLoopAddr(void *global_step, void *loop_per_iter, void *loop_cond) {
  global_step_ = reinterpret_cast<uintptr_t>(global_step);
  loop_per_iter_ = reinterpret_cast<uintptr_t>(loop_per_iter);
  loop_cond_ = reinterpret_cast<uintptr_t>(loop_cond);
}

void DumpOp::SetDynamicModelInfo(const string &dynamic_model_name, const string &dynamic_om_name,
                                 uint32_t dynamic_model_id) {
  dynamic_model_name_ = dynamic_model_name;
  dynamic_om_name_ = dynamic_om_name;
  dynamic_model_id_ = dynamic_model_id;
}
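// Record any non-zero loop-control addresses (global step, iterations per loop,
// loop cond) in the OpMappingInfo; an address of 0 means it was never set and is skipped.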
static void SetOpMappingLoopAddr(uintptr_t step_id, uintptr_t loop_per_iter, uintptr_t loop_cond,
                                 toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) {
  if (step_id != 0) {
    GELOGI("Exists step_id.");
    op_mapping_info.set_step_id_addr(static_cast<uint64_t>(step_id));
  } else {
    GELOGI("step_id is null.");
  }
  if (loop_per_iter != 0) {
    GELOGI("Exists loop_per_iter.");
    op_mapping_info.set_iterations_per_loop_addr(static_cast<uint64_t>(loop_per_iter));
  } else {
    GELOGI("loop_per_iter is null.");
  }
  if (loop_cond != 0) {
    GELOGI("Exists loop_cond.");
    op_mapping_info.set_loop_cond_addr(static_cast<uint64_t>(loop_cond));
  } else {
    GELOGI("loop_cond is null.");
  }
}
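// Append one Output entry per output tensor to the dump task: data type, format,
// runtime and origin shapes, byte size, and the device address recorded in output_addrs_.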
Status DumpOp::DumpOutput(toolkit::aicpu::dump::Task &task) {
  GELOGI("Start dump output in Launch dump op");
  const auto &output_descs = op_desc_->GetAllOutputsDesc();
  for (size_t i = 0; i < output_descs.size(); ++i) {
    toolkit::aicpu::dump::Output output;
    output.set_data_type(static_cast<int32_t>(DataTypeUtil::GetIrDataType(output_descs.at(i).GetDataType())));
    output.set_format(static_cast<int32_t>(output_descs.at(i).GetFormat()));
    for (auto dim : output_descs.at(i).GetShape().GetDims()) {
      output.mutable_shape()->add_dim(dim);
    }
    for (auto dim : output_descs.at(i).GetOriginShape().GetDims()) {
      output.mutable_origin_shape()->add_dim(dim);
    }
    int64_t output_size = 0;
    if (TensorUtils::GetTensorSizeInBytes(output_descs.at(i), output_size) != SUCCESS) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][TensorSize]Failed, output %zu, node %s(%s)",
             i, op_desc_->GetName().c_str(), op_desc_->GetType().c_str());
      REPORT_CALL_ERROR("E19999", "Get output %zu tensor size of node %s(%s) failed",
                        i, op_desc_->GetName().c_str(), op_desc_->GetType().c_str());
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    GELOGD("Get output size in launch dump op is %ld", output_size);
    output.set_size(output_size);
    output.set_address(static_cast<uint64_t>(output_addrs_[i]));
    task.mutable_output()->Add(std::move(output));
  }
  return SUCCESS;
}
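// Mirror of DumpOutput: append one Input entry per input tensor to the dump task.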
Status DumpOp::DumpInput(toolkit::aicpu::dump::Task &task) {
  GELOGI("Start dump input in Launch dump op");
  const auto &input_descs = op_desc_->GetAllInputsDesc();
  for (size_t i = 0; i < input_descs.size(); ++i) {
    toolkit::aicpu::dump::Input input;
    input.set_data_type(static_cast<int32_t>(DataTypeUtil::GetIrDataType(input_descs.at(i).GetDataType())));
    input.set_format(static_cast<int32_t>(input_descs.at(i).GetFormat()));
    for (auto dim : input_descs.at(i).GetShape().GetDims()) {
      input.mutable_shape()->add_dim(dim);
    }
    for (auto dim : input_descs.at(i).GetOriginShape().GetDims()) {
      input.mutable_origin_shape()->add_dim(dim);
    }
    int64_t input_size = 0;
    if (TensorUtils::GetTensorSizeInBytes(input_descs.at(i), input_size) != SUCCESS) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][TensorSize]Failed, input %zu, node %s(%s)",
             i, op_desc_->GetName().c_str(), op_desc_->GetType().c_str());
      REPORT_CALL_ERROR("E19999", "Get input %zu tensor size of node %s(%s) failed",
                        i, op_desc_->GetName().c_str(), op_desc_->GetType().c_str());
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }
    GELOGD("Get input size in launch dump op is %ld", input_size);
    input.set_size(input_size);
    input.set_address(static_cast<uint64_t>(input_addrs_[i]));
    task.mutable_input()->Add(std::move(input));
  }
  return SUCCESS;
}
void DumpOp::SetDumpInfo(const DumpProperties &dump_properties, const OpDescPtr &op_desc, vector<uintptr_t> input_addrs,
                         vector<uintptr_t> output_addrs, rtStream_t stream) {
  dump_properties_ = dump_properties;
  op_desc_ = op_desc;
  input_addrs_ = input_addrs;
  output_addrs_ = output_addrs;
  stream_ = stream;
}
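// Serialize the OpMappingInfo proto, stage the bytes and their size in device memory,
// then launch the "DumpDataInfo" AICPU kernel on stream_ with both device addresses as arguments.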
Status DumpOp::ExecutorDumpOp(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) {
  std::string proto_msg;
  size_t proto_size = op_mapping_info.ByteSizeLong();
  bool ret = op_mapping_info.SerializeToString(&proto_msg);
  if (!ret || proto_size == 0) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Serialize][Protobuf]Failed, proto_size is %zu",
           proto_size);
    REPORT_CALL_ERROR("E19999", "[Serialize][Protobuf]Failed, proto_size is %zu", proto_size);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  // Stage the serialized proto in device memory so the AICPU kernel can read it.
  rtError_t rt_ret = rtMalloc(&proto_dev_mem_, proto_size, RT_MEMORY_HBM);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "[Call][rtMalloc]Failed, ret: 0x%X", rt_ret);
    REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, ret: 0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  rt_ret = rtMemcpy(proto_dev_mem_, proto_size, proto_msg.c_str(), proto_size, RT_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "[Call][rtMemcpy]Failed, ret: 0x%X", rt_ret);
    REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, ret: 0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  // The proto size itself is also passed to the kernel through device memory.
  rt_ret = rtMalloc(&proto_size_dev_mem_, sizeof(size_t), RT_MEMORY_HBM);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "[Call][rtMalloc]Failed, ret: 0x%X", rt_ret);
    REPORT_CALL_ERROR("E19999", "Call rtMalloc failed, ret: 0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  rt_ret = rtMemcpy(proto_size_dev_mem_, sizeof(size_t), &proto_size, sizeof(size_t), RT_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "[Call][rtMemcpy]Failed, ret 0x%X", rt_ret);
    REPORT_CALL_ERROR("E19999", "Call rtMemcpy failed, ret 0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  // Kernel args layout: an AicpuParamHead followed by two device addresses
  // (the serialized OpMappingInfo and its size).
  constexpr int32_t io_addr_num = 2;
  constexpr uint32_t args_size = sizeof(aicpu::AicpuParamHead) + io_addr_num * sizeof(uint64_t);
  char args[args_size] = {0};
  auto param_head = reinterpret_cast<aicpu::AicpuParamHead *>(args);
  param_head->length = args_size;
  param_head->ioAddrNum = io_addr_num;
  auto io_addr = reinterpret_cast<uint64_t *>(args + sizeof(aicpu::AicpuParamHead));
  io_addr[0] = reinterpret_cast<uintptr_t>(proto_dev_mem_);
  io_addr[1] = reinterpret_cast<uintptr_t>(proto_size_dev_mem_);
  rt_ret = rtCpuKernelLaunch(nullptr, kDumpKernelsDumpOp,
                             1,  // blockDim default 1
                             args, args_size,
                             nullptr,  // no need smDesc
                             stream_);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "[Call][rtCpuKernelLaunch]Failed, ret 0x%X", rt_ret);
    REPORT_CALL_ERROR("E19999", "Call rtCpuKernelLaunch failed, ret 0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  GELOGI("Kernel launch dump op success");
  return SUCCESS;
}
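// Decide which model name to attach to the dump: prefer the om name when it is in the
// configured dump list, otherwise fall back to the model name; fail if neither is listed
// and dumping for all models is not enabled.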
Status DumpOp::SetDumpModelName(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) {
  if (dynamic_model_name_.empty() && dynamic_om_name_.empty()) {
    GELOGI("Single op dump, no need set model name");
    return SUCCESS;
  }
  std::set<std::string> model_list = dump_properties_.GetAllDumpModel();
  bool not_find_by_omname = model_list.find(dynamic_om_name_) == model_list.end();
  bool not_find_by_modelname = model_list.find(dynamic_model_name_) == model_list.end();
  std::string dump_model_name = not_find_by_omname ? dynamic_model_name_ : dynamic_om_name_;
  if (model_list.find(DUMP_ALL_MODEL) == model_list.end()) {
    if (not_find_by_omname && not_find_by_modelname) {
      std::string model_list_str;
      for (auto &model : model_list) {
        model_list_str += "[" + model + "].";
      }
      GELOGW("Model %s will not be set to dump, dump list: %s", dump_model_name.c_str(), model_list_str.c_str());
      return FAILED;
    }
  }
  if (!dump_model_name.empty() && dump_properties_.IsDumpOpen()) {
    GELOGI("Dump model name is %s", dump_model_name.c_str());
    op_mapping_info.set_model_name(dump_model_name);
  }
  return SUCCESS;
}
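// Top-level entry point: build the OpMappingInfo (dump path, step, model, loop addresses),
// fill dump tasks for inputs and/or outputs according to the dump mode, and hand the
// result to ExecutorDumpOp for the actual kernel launch.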
Status DumpOp::LaunchDumpOp() {
  GELOGI("Start to launch dump op %s", op_desc_->GetName().c_str());
  int32_t device_id = 0;
  rtError_t rt_ret = rtGetDevice(&device_id);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(rt_ret, "[Call][rtGetDevice]Failed, ret 0x%X", rt_ret);
    REPORT_CALL_ERROR("E19999", "[Call][rtGetDevice]Failed, ret 0x%X", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  if (device_id < 0) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][DeviceId]Failed, device_id %d", device_id);
    REPORT_INNER_ERROR("E19999", "Check device_id %d failed", device_id);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  toolkit::aicpu::dump::OpMappingInfo op_mapping_info;
  auto dump_path = dump_properties_.GetDumpPath() + std::to_string(device_id) + "/";
  op_mapping_info.set_dump_path(dump_path);
  op_mapping_info.set_flag(kAicpuLoadFlag);
  op_mapping_info.set_dump_step(dump_properties_.GetDumpStep());
  op_mapping_info.set_model_id(dynamic_model_id_);
  if (SetDumpModelName(op_mapping_info) != SUCCESS) {
    return SUCCESS;
  }
  SetOpMappingLoopAddr(global_step_, loop_per_iter_, loop_cond_, op_mapping_info);
  GELOGI("Dump step is %s, dump path is %s in Launch dump op", dump_properties_.GetDumpStep().c_str(),
         dump_path.c_str());
  uint32_t task_id = 0;
  uint32_t stream_id = 0;
  rt_ret = rtGetTaskIdAndStreamID(&task_id, &stream_id);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGW("call rtGetTaskIdAndStreamID failed, ret = 0x%X", rt_ret);
  }
  toolkit::aicpu::dump::Task task;
  task.set_task_id(task_id);
  task.set_stream_id(stream_id);
  task.mutable_op()->set_op_name(op_desc_->GetName());
  task.mutable_op()->set_op_type(op_desc_->GetType());
  if (dump_properties_.GetDumpMode() == kDumpOutput) {
    auto ret = DumpOutput(task);
    if (ret != SUCCESS) {
      GELOGE(ret, "[Dump][Output]Failed, node %s(%s), ret 0x%X",
             op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      REPORT_CALL_ERROR("E19999", "Dump Output failed, node %s(%s), ret 0x%X",
                        op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      return ret;
    }
    op_mapping_info.mutable_task()->Add(std::move(task));
  }
  if (dump_properties_.GetDumpMode() == kDumpInput) {
    auto ret = DumpInput(task);
    if (ret != SUCCESS) {
      GELOGE(ret, "[Dump][Input]Failed, node %s(%s), ret 0x%X",
             op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      REPORT_CALL_ERROR("E19999", "Dump Input failed, node %s(%s), ret 0x%X",
                        op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      return ret;
    }
    op_mapping_info.mutable_task()->Add(std::move(task));
  }
  if (dump_properties_.GetDumpMode() == kDumpAll || dump_properties_.IsOpDebugOpen()) {
    auto ret = DumpOutput(task);
    if (ret != SUCCESS) {
      GELOGE(ret, "[Dump][Output]Failed when in dumping all, node %s(%s), ret 0x%X",
             op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      REPORT_CALL_ERROR("E19999", "Dump Output failed when in dumping all, node %s(%s), ret 0x%X",
                        op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      return ret;
    }
    ret = DumpInput(task);
    if (ret != SUCCESS) {
      GELOGE(ret, "[Dump][Input]Failed when in dumping all, node %s(%s), ret 0x%X",
             op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      REPORT_CALL_ERROR("E19999", "Dump Input failed when in dumping all, node %s(%s), ret 0x%X",
                        op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      return ret;
    }
    op_mapping_info.mutable_task()->Add(std::move(task));
  }
  auto ret = ExecutorDumpOp(op_mapping_info);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Dump][Op]Failed, ret 0x%X", ret);
    REPORT_CALL_ERROR("E19999", "Executor dump op failed, ret 0x%X", ret);
    return ret;
  }
  return SUCCESS;
}
}  // namespace ge
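For context, here is a minimal sketch of how the public DumpOp API in this file fits together from a caller's point of view. The wrapper name LaunchNodeDump and its parameters are illustrative, not part of this file; it assumes a valid OpDescPtr, resolved tensor device addresses, and an rtStream_t are already available from the surrounding executor:

// Illustrative wrapper only: shows the call order of DumpOp's public API.
Status LaunchNodeDump(const DumpProperties &dump_properties, const OpDescPtr &op_desc,
                      const std::vector<uintptr_t> &input_addrs,
                      const std::vector<uintptr_t> &output_addrs, rtStream_t stream) {
  DumpOp dump_op;
  // Bind the dump configuration, node description, and tensor device addresses.
  dump_op.SetDumpInfo(dump_properties, op_desc, input_addrs, output_addrs, stream);
  // Optional for training graphs: pass loop-control device addresses so the
  // configured dump_step is honored by the AICPU kernel.
  // dump_op.SetLoopAddr(global_step_addr, loop_per_iter_addr, loop_cond_addr);
  // Builds the OpMappingInfo and launches the "DumpDataInfo" AICPU kernel on `stream`.
  return dump_op.LaunchDumpOp();
}

Note that SetDumpInfo takes the address vectors by value, so the caller's vectors need not outlive the DumpOp instance; only the device memory they point at must stay valid until the dump kernel has run.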

The Graph Engine (GE) module is a submodule of MindSpore, implemented in C++. It sits between the front-end module ME and the underlying hardware, serving as the bridge between them. GE takes the graph delivered by ME as input, applies a series of deep graph-optimization passes, and outputs a graph that runs efficiently on the underlying hardware. GE performs optimizations tailored to the hardware architecture of the Ascend AI processor in order to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE consists mainly of two parts, GE API and GE Core; the detailed architecture diagram is shown below.