You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

csa_interact.h 5.3 kB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef GE_OMM_CSA_INTERACT_H_
  17. #define GE_OMM_CSA_INTERACT_H_
  18. #include <string>
  19. #include "framework/common/ge_inner_error_codes.h"
  20. namespace ge {
  ///
  /// @brief Top-level job state. Enumerators are numbered consecutively, starting at 1.
  ///
  enum JobState {
    JOBSTATE_WAITING = 1,
    JOBSTATE_RUNNING = 2,
    JOBSTATE_KILLING = 3,
    JOBSTATE_SUCCEED = 4,
    JOBSTATE_FAILED = 5,
    JOBSTATE_KILLED = 6,
    // The spelling "UNKOWN" is kept on purpose: the identifier is part of the public interface.
    JOBSTATE_UNKOWN = 7
  };
  ///
  /// @brief Detailed job state. Enumerators are numbered consecutively, starting at 201.
  ///
  enum JobSubState {
    JOBSUBSTATE_ENV_INIT = 201,
    JOBSUBSTATE_ENV_FIN = 202,
    // The spelling "RESOUCE" is kept on purpose: the identifier is part of the public interface.
    JOBSUBSTATE_RESOUCE_ALLOC = 203,
    JOBSUBSTATE_MODEL_COMPILE = 204,
    JOBSUBSTATE_GRAPH_PREPARE = 205,
    JOBSUBSTATE_GRAPH_SPLIT = 206,
    JOBSUBSTATE_GRAPH_OPTIMIZE = 207,
    JOBSUBSTATE_GRAPH_BUILD = 208,
    JOBSUBSTATE_GRAPH_LOAD = 209,
    JOBSUBSTATE_GRAPH_EXEC = 210,
    JOBSUBSTATE_GRAPH_UNLOAD = 211,
    JOBSUBSTATE_OTHER = 212
  };
  ///
  /// @brief Identifier of the module an error code is attributed to.
  ///        The values are plain identifiers, not bit flags (0x06 and 0x12 are not powers of two).
  ///
  enum ErrorModule {
    ERROR_MODULE_DRIVER  = 0x01,
    ERROR_MODULE_RUNTIME = 0x04,
    ERROR_MODULE_CCE     = 0x06,
    ERROR_MODULE_FMK     = 0x08,
    ERROR_MODULE_HCCL    = 0x12  // hexadecimal: 18 in decimal
  };
  51. struct CsaErrorCode {
  52. CsaErrorCode()
  53. : module_ret_errcode(0),
  54. error_module(ERROR_MODULE_FMK),
  55. job_sub_state(JOBSUBSTATE_OTHER) {}
  56. ~CsaErrorCode() {}
  57. uint32_t module_ret_errcode;
  58. ErrorModule error_module;
  59. JobSubState job_sub_state;
  60. };
  61. class CsaInteract {
  62. public:
  63. ///
  64. /// @brief Obtain CsaInteract instance
  65. /// @return CsaInteract instance
  66. ///
  67. static CsaInteract& GetInstance();
  68. ///
  69. /// @brief CsaInteract instance initialization
  70. /// @param [in] dev_index device index
  71. /// @param [in] job_id job id
  72. /// @return void
  73. ///
  74. void Init(int32_t dev_index, int64_t job_id);
  75. ///
  76. /// @brief Update job state file
  77. /// @param [in] job_state job state
  78. /// @param [in] job_sub_state detailed job state
  79. /// @param [in] module_ret_errcode sub module training failure error code
  80. /// @param [in] error_module error module identified by FMK
  81. /// @return Status
  82. ///
  83. Status WriteJobState(JobState job_state,
  84. JobSubState job_sub_state = JOBSUBSTATE_OTHER,
  85. uint32_t module_ret_errcode = SUCCESS,
  86. ErrorModule error_module = ERROR_MODULE_FMK);
  87. ///
  88. /// @brief Update error code in the job state file
  89. /// @param [in] module_ret_errcode sub module training failure error code
  90. /// @param [in] error_module error module identified by FMK
  91. /// @param [in] job_sub_state detailed job state
  92. /// @return void
  93. ///
  94. void WriteErrorCode(uint32_t module_ret_errcode, ErrorModule error_module,
  95. JobSubState job_sub_state);
  96. ///
  97. /// @brief Record errors that occurred durning the training
  98. /// @param [in] module_ret_errcode sub module training failure error code
  99. /// @param [in] error_module error module identified by FMK
  100. /// @param [in] job_sub_state detailed job state
  101. /// @return void
  102. ///
  103. void StoreInternalErrorCode(uint32_t module_ret_errcode,
  104. ErrorModule error_module,
  105. JobSubState job_sub_state);
  106. ///
  107. /// @brief Update training error code in the job state file
  108. /// @return void
  109. ///
  110. void WriteInternalErrorCode();
  111. ///
  112. /// @brief Update network connectivity detect file
  113. /// @param [in] content network connectivity content
  114. /// @return Status
  115. ///
  116. Status WriteHcomDetection(const std::string& content);
  117. private:
  118. CsaInteract()
  119. : dev_index_(0),
  120. job_id_(0),
  121. is_init_(false),
  122. curr_state_(JOBSTATE_UNKOWN),
  123. is_have_internal_error_(false) {}
  124. ~CsaInteract() {}
  125. CsaInteract(const CsaInteract&) = delete;
  126. CsaInteract(CsaInteract&&) = delete;
  127. CsaInteract& operator=(const CsaInteract&) = delete;
  128. CsaInteract& operator=(CsaInteract&&) = delete;
  129. ///
  130. /// @ingroup WriteFile
  131. /// @brief Write the content into the file. If the file does not exist, create the file
  132. /// @param [in] file_name: File name to be written
  133. /// @param [in] content: Contents to be written
  134. /// @return Status
  135. ///
  136. Status WriteFile(const std::string& file_name, const std::string& content);
  137. ///
  138. /// @ingroup MakePath
  139. /// @brief Verify whether the file path exists, if not, recursively create the folder
  140. /// @param [in] file_name: File name to be verified
  141. /// @return Status
  142. ///
  143. Status MakePath(const std::string& file_name);
  144. // device index
  145. int32_t dev_index_;
  146. // job id
  147. int64_t job_id_;
  148. // is initialization complete
  149. bool is_init_;
  150. // current job state
  151. JobState curr_state_;
  152. // job state file
  153. std::string job_state_file_;
  154. // network connectivity detect file
  155. std::string hcom_detect_file_;
  156. // identification of internal errors that occurred during the training
  157. bool is_have_internal_error_;
  158. // error code information
  159. CsaErrorCode csa_error_code_;
  160. };
  161. } // namespace ge
  162. #endif // GE_OMM_CSA_INTERACT_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用,而用户对此并无感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示。