You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

profiling_definitions.h 6.8 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. /**
  2. * Copyright 2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef AIR_CXX_PROFILING_DEFINITIONS_H
  17. #define AIR_CXX_PROFILING_DEFINITIONS_H
  18. #include <string>
  19. #include <iostream>
  20. #include <mutex>
  21. #include <unordered_map>
  22. #include "graph/profiler.h"
  23. #include "external/ge/ge_api_types.h"
  24. #include "toolchain/prof_callback.h"
  25. namespace ge {
  26. namespace profiling {
  /**
   * Built-in profiling indices.
   *
   * Enumerators are numbered consecutively starting at zero, so
   * kProfilingIndexEnd equals the number of entries declared before it.
   * New entries must be appended directly above kProfilingIndexEnd so that
   * existing values keep their numbers.
   *
   * NOTE(review): several identifiers carry historical misspellings
   * (kPropgateOutputs, kRtEventSychronize, kInferShapePropgate,
   * kCalcRuningParam). They are kept as-is because renaming them would break
   * any code that already refers to them.
   */
  enum {
    kAclCompileAndExecute = 0,
    kAclMatchOpModel,
    kAclMatchStaticOpModel,
    kAclMatchDynamicOpModel,
    kAclExecuteAsync,
    kAclLoadSingleOp,
    kAclBuildOpModel,
    kInferShape,
    kTiling,
    kUpdateShape,
    kConstPrepare,
    kInitHybridExecuteArgs,
    kInitInferShapeContext,
    kDestroyInferShapeContext,
    kResetSubgraphExecutor,
    kCommitInferShapeTask,
    kDeviceToHost,
    kPrepareTask,
    kLaunchTask,
    kCommitTilingTask,
    kAtomic,
    kKernelLaunchPrepare,
    kRtKernelLaunch,
    kRtEventCreateRecord,
    kRtEventSync,
    kRtEventDestroy,
    kRtStreamSync,
    kOpExecute,
    kModelExecute,
    kAllocMem,
    kCopyH2D,
    kPrepareNode,
    kWaitForPrepareDone,
    kPropgateOutputs,
    kOnNodeDoneCallback,
    kValidateInputTensor,
    kAfterExecuted,
    kRtEventSychronize,
    kInferShapeWaitDependShape,
    kInferShapeWaitInputTensor,
    kInferShapeCallInferFunc,
    kInferShapePropgate,

    // v2 control node
    kSelectBranch,
    kExecuteSubGraph,
    kInitSubGraphExecutor,

    // fuzz compile
    kSelectBin,
    kFindCompileCache,
    kAddCompileCache,
    kFuzzCompileOp,
    kCalcRuningParam,
    kGenTask,
    kRegisterBin,

    // FFTS Plus
    kFftsPlusPreThread,
    kFftsPlusNodeThread,
    kFftsPlusInferShape,
    kOpFftsCalculateV2,
    kInitThreadRunInfo,
    kFftsPlusGraphSchedule,
    kKnownGetAddrAndPrefCnt,
    kKernelGetAddrAndPrefCnt,
    kUpdateAddrAndPrefCnt,
    kInitOpRunInfo,
    kGetAutoThreadParam,
    kAllocateOutputs,
    kAllocateWorkspaces,
    kInitTaskAddrs,
    kInitThreadRunParam,
    kUpdateTaskAndCache,
    kFftsPlusTaskLaunch,

    // Add new definitions here
    kProfilingIndexEnd
  };
  // Hash id constant with value zero.
  // NOTE(review): by its name this is the "no valid hash id" sentinel —
  // confirm against the QueryHashId / RegisterStringHash implementations.
  constexpr uint64_t kInvalidHashId{0UL};
  104. class ProfilingContext {
  105. public:
  106. static bool IsDumpToStdEnabled();
  107. static ProfilingContext &GetInstance();
  108. ProfilingContext();
  109. ~ProfilingContext();
  110. /*
  111. * 还有一种思路是`IsEnabled`只判断profiler_是否为空指针,不再设置单独的enabled标记位,这样可以少一个标记位。
  112. * 但是这么做就意味着,profiler_实例在未使能profiling时,必须是空指针状态。
  113. * 为了性能考虑,profiling机制在编译和加载时,就会调用`RegisterString`,向profiler_注册字符串,后续执行时,只会使用注册好的index了。
  114. * 因此存在一种场景:编译时并未使能profiling(因为编译时间很长,使能profiling也无法真实反应执行时的耗时状态),
  115. * 因此编译时注册字符串的动作并没有生效。在执行时,动态的打开了profiling,这种场景下,执行时无法拿到注册后字符串
  116. */
  117. bool IsEnabled() const noexcept {
  118. return enabled_ && (profiler_ != nullptr);
  119. }
  120. void SetEnable() noexcept {
  121. enabled_ = true;
  122. }
  123. void SetDisable() noexcept {
  124. enabled_ = false;
  125. }
  126. void RecordCurrentThread(const int64_t element, const int64_t event, const EventType et,
  127. const std::chrono::time_point<std::chrono::system_clock> time_point) {
  128. if (IsEnabled()) {
  129. profiler_->RecordCurrentThread(element, event, et, time_point);
  130. }
  131. }
  132. void RecordCurrentThread(const int64_t element, const int64_t event, const EventType et) {
  133. RecordCurrentThread(element, event, et, std::chrono::system_clock::now());
  134. }
  135. const Profiler *GetProfiler() const {
  136. return profiler_.get();
  137. }
  138. void Dump(std::ostream &out_stream) const {
  139. if (IsEnabled()) {
  140. profiler_->Dump(out_stream);
  141. } else {
  142. out_stream << "Profiling not enable, skip to dump" << std::endl;
  143. }
  144. }
  145. void DumpToStdOut() const {
  146. Dump(std::cout);
  147. }
  148. void Reset() {
  149. if (IsEnabled()) {
  150. profiler_->Reset();
  151. }
  152. }
  153. int64_t RegisterString(const std::string &str);
  154. int64_t RegisterStringHash(const uint64_t hash_id, const std::string &str);
  155. void UpdateElementHashId();
  156. static Status QueryHashId(const std::string &src_str, uint64_t &hash_id);
  157. size_t GetRegisterStringNum() const {
  158. return strings_to_index_.size();
  159. }
  160. void Init();
  161. private:
  162. void UpdateHashByStr(const std::string &str, const uint64_t hash);
  163. private:
  164. bool inited_;
  165. bool enabled_;
  166. int64_t str_index_;
  167. std::unordered_map<std::string, int64_t> strings_to_index_;
  168. std::mutex strings_to_index_mutex_;
  169. std::unique_ptr<Profiler> profiler_;
  170. };
  171. class ScopeProfiler {
  172. public:
  173. ScopeProfiler(const int64_t element, const int64_t event) : element_(element), event_(event) {
  174. if (ProfilingContext::GetInstance().IsEnabled()) {
  175. start_trace_ = std::chrono::system_clock::now();
  176. }
  177. }
  178. ~ScopeProfiler() {
  179. if (ProfilingContext::GetInstance().IsEnabled()) {
  180. ProfilingContext::GetInstance().RecordCurrentThread(element_, event_, EventType::kEventStart, start_trace_);
  181. ProfilingContext::GetInstance().RecordCurrentThread(element_, event_, EventType::kEventEnd);
  182. }
  183. }
  184. void SetElement(const int64_t element) {
  185. element_ = element;
  186. }
  187. private:
  188. std::chrono::time_point<std::chrono::system_clock> start_trace_;
  189. int64_t element_;
  190. int64_t event_;
  191. };
  192. } // namespace profiling
  193. } // namespace ge
  // Records a kEventStart event for (element, event) on the ProfilingContext
  // instance, stamped with the current time.
  #define PROFILING_START(element, event)                                 \
    ge::profiling::ProfilingContext::GetInstance().RecordCurrentThread(   \
        (element), (event), ge::profiling::EventType::kEventStart)

  // Records a kEventEnd event for (element, event) on the ProfilingContext
  // instance, stamped with the current time.
  #define PROFILING_END(element, event)                                   \
    ge::profiling::ProfilingContext::GetInstance().RecordCurrentThread(   \
        (element), (event), ge::profiling::EventType::kEventEnd)

  // Declares a ScopeProfiler local named `profiler` in the current scope.
  // Because the name is fixed, only one of PROFILING_SCOPE /
  // PROFILING_SCOPE_CONST can appear per scope.
  #define PROFILING_SCOPE(element, event) \
    ge::profiling::ScopeProfiler profiler((element), (event))

  // Same as PROFILING_SCOPE but the local is const, so it cannot be used
  // together with PROFILING_SCOPE_ELEMENT (SetElement is non-const).
  #define PROFILING_SCOPE_CONST(element, event) \
    const ge::profiling::ScopeProfiler profiler((element), (event))

  // Changes the element of the `profiler` local declared by PROFILING_SCOPE.
  #define PROFILING_SCOPE_ELEMENT(element) profiler.SetElement((element))
  203. #endif // AIR_CXX_PROFILING_DEFINITIONS_H

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用,用户对此并无感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示。