You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

profiling_definitions.h 6.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201
  1. /**
  2. * Copyright 2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef AIR_CXX_PROFILING_DEFINITIONS_H
  17. #define AIR_CXX_PROFILING_DEFINITIONS_H
  18. #include <string>
  19. #include <iostream>
  20. #include <mutex>
  21. #include <unordered_map>
  22. #include "graph/profiler.h"
  23. #include "external/ge/ge_api_types.h"
  24. #include "toolchain/prof_callback.h"
  25. namespace ge {
  26. namespace profiling {
  /**
   * Indices of the predefined profiling names.
   *
   * The enumerators carry implicit, consecutive values starting at 0, so their order is
   * significant: inserting or removing an entry in the middle renumbers every entry after it.
   * New entries belong directly in front of kProfilingIndexEnd, which therefore always equals
   * the number of predefined entries.
   *
   * NOTE(review): kPropgateOutputs, kRtEventSychronize, kInferShapePropgate and kCalcRuningParam
   * are misspelled, but the names are kept as-is because code outside this file may refer to them.
   */
  enum {
    kAclCompileAndExecute = 0,
    kAclMatchOpModel,
    kAclMatchStaticOpModel,
    kAclMatchDynamicOpModel,
    kAclExecuteAsync,
    kAclLoadSingleOp,
    kAclBuildOpModel,
    kInferShape,
    kTiling,
    kUpdateShape,
    kConstPrepare,
    kInitHybridExecuteArgs,
    kInitInferShapeContext,
    kDestroyInferShapeContext,
    kResetSubgraphExecutor,
    kCommitInferShapeTask,
    kDeviceToHost,
    kPrepareTask,
    kLaunchTask,
    kCommitTilingTask,
    kAtomic,
    kKernelLaunchPrepare,
    kRtKernelLaunch,
    kRtEventCreateRecord,
    kRtEventSync,
    kRtEventDestroy,
    kRtStreamSync,
    kOpExecute,
    kModelExecute,
    kAllocMem,
    kCopyH2D,
    kPrepareNode,
    kWaitForPrepareDone,
    kPropgateOutputs,
    kOnNodeDoneCallback,
    kValidateInputTensor,
    kAfterExecuted,
    kRtEventSychronize,
    kInferShapeWaitDependShape,
    kInferShapeWaitInputTensor,
    kInferShapeCallInferFunc,
    kInferShapePropgate,

    // v2 control node
    kSelectBranch,
    kExecuteSubGraph,
    kInitSubGraphExecutor,

    // fuzz compile
    kSelectBin,
    kFindCompileCache,
    kAddCompileCache,
    kFuzzCompileOp,
    kCalcRuningParam,
    kGenTask,
    kRegisterBin,

    // Add new definitions here, above the end marker.
    kProfilingIndexEnd
  };
  // Hash id value 0; going by its name it marks "no valid hash id" (callers are not visible here).
  constexpr uint64_t kInvalidHashId{0UL};
  86. class ProfilingContext {
  87. public:
  88. static bool IsDumpToStdEnabled();
  89. static ProfilingContext &GetInstance();
  90. ProfilingContext();
  91. ~ProfilingContext();
  92. /*
  93. * 还有一种思路是`IsEnabled`只判断profiler_是否为空指针,不再设置单独的enabled标记位,这样可以少一个标记位。
  94. * 但是这么做就意味着,profiler_实例在未使能profiling时,必须是空指针状态。
  95. * 为了性能考虑,profiling机制在编译和加载时,就会调用`RegisterString`,向profiler_注册字符串,后续执行时,只会使用注册好的index了。
  96. * 因此存在一种场景:编译时并未使能profiling(因为编译时间很长,使能profiling也无法真实反应执行时的耗时状态),
  97. * 因此编译时注册字符串的动作并没有生效。在执行时,动态的打开了profiling,这种场景下,执行时无法拿到注册后字符串
  98. */
  99. bool IsEnabled() const noexcept {
  100. return enabled_ && (profiler_ != nullptr);
  101. }
  102. void SetEnable() noexcept {
  103. enabled_ = true;
  104. }
  105. void SetDisable() noexcept {
  106. enabled_ = false;
  107. }
  108. void RecordCurrentThread(const int64_t element, const int64_t event, const EventType et,
  109. const std::chrono::time_point<std::chrono::system_clock> time_point) {
  110. if (IsEnabled()) {
  111. profiler_->RecordCurrentThread(element, event, et, time_point);
  112. }
  113. }
  114. void RecordCurrentThread(const int64_t element, const int64_t event, const EventType et) {
  115. RecordCurrentThread(element, event, et, std::chrono::system_clock::now());
  116. }
  117. const Profiler *GetProfiler() const {
  118. return profiler_.get();
  119. }
  120. void Dump(std::ostream &out_stream) const {
  121. if (IsEnabled()) {
  122. profiler_->Dump(out_stream);
  123. } else {
  124. out_stream << "Profiling not enable, skip to dump" << std::endl;
  125. }
  126. }
  127. void DumpToStdOut() const {
  128. Dump(std::cout);
  129. }
  130. void Reset() {
  131. if (IsEnabled()) {
  132. profiler_->Reset();
  133. }
  134. }
  135. int64_t RegisterString(const std::string &str);
  136. int64_t RegisterStringHash(const uint64_t hash_id, const std::string &str);
  137. void UpdateElementHashId(const MsprofReporterCallback reporter_callback);
  138. static Status QueryHashId(const MsprofReporterCallback reporter_callback, const std::string &src_str,
  139. uint64_t &hash_id);
  140. size_t GetRegisterStringNum() const {
  141. return strings_to_index_.size();
  142. }
  143. void Init();
  144. private:
  145. void UpdateHashByStr(const std::string &str, const uint64_t hash);
  146. private:
  147. bool inited_;
  148. bool enabled_;
  149. int64_t str_index_;
  150. std::unordered_map<std::string, int64_t> strings_to_index_;
  151. std::mutex strings_to_index_mutex_;
  152. std::unique_ptr<Profiler> profiler_;
  153. };
  154. class ScopeProfiler {
  155. public:
  156. ScopeProfiler(const int64_t element, const int64_t event) : element_(element), event_(event) {
  157. if (ProfilingContext::GetInstance().IsEnabled()) {
  158. start_trace_ = std::chrono::system_clock::now();
  159. }
  160. }
  161. ~ScopeProfiler() {
  162. if (ProfilingContext::GetInstance().IsEnabled()) {
  163. ProfilingContext::GetInstance().RecordCurrentThread(element_, event_, EventType::kEventStart, start_trace_);
  164. ProfilingContext::GetInstance().RecordCurrentThread(element_, event_, EventType::kEventEnd);
  165. }
  166. }
  167. void SetElement(const int64_t element) {
  168. element_ = element;
  169. }
  170. private:
  171. std::chrono::time_point<std::chrono::system_clock> start_trace_;
  172. int64_t element_;
  173. int64_t event_;
  174. };
  175. } // namespace profiling
  176. } // namespace ge
  // Records a start event for (element, event) on the current thread via the shared ProfilingContext.
  #define PROFILING_START(element, event)                                          \
    ge::profiling::ProfilingContext::GetInstance().RecordCurrentThread((element), \
                                                                       (event),   \
                                                                       ge::profiling::EventType::kEventStart)

  // Records the matching end event for (element, event) on the current thread.
  #define PROFILING_END(element, event)                                            \
    ge::profiling::ProfilingContext::GetInstance().RecordCurrentThread((element), \
                                                                       (event),   \
                                                                       ge::profiling::EventType::kEventEnd)

  // Declares a ScopeProfiler local named `profiler` in the enclosing scope.
  #define PROFILING_SCOPE(element, event) ge::profiling::ScopeProfiler profiler((element), (event))

  // Changes the element of the `profiler` local declared by PROFILING_SCOPE; that macro must have
  // been used earlier in a visible scope.
  #define PROFILING_SCOPE_ELEMENT(element) profiler.SetElement((element))
  185. #endif // AIR_CXX_PROFILING_DEFINITIONS_H

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示