You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

prof_common.h 14 kB


  1. /**
  2. * @file prof_common.h
  3. *
  4. * Copyright (c) Huawei Technologies Co., Ltd. 2019-2022. All rights reserved.
  5. *
  6. * This program is distributed in the hope that it will be useful,
  7. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  9. *
  10. */
  11. #ifndef MSPROFILER_PROF_COMMON_H_
  12. #define MSPROFILER_PROF_COMMON_H_
  13. #include <stdint.h>
  14. #ifdef __cplusplus
  15. extern "C" {
  16. #endif // __cplusplus
  17. #define MSPROF_DATA_HEAD_MAGIC_NUM 0x5a5a // default value of the magicNumber field in the report structs of this header
  18. enum MsprofDataTag { // record-type tags; each reporting module is given a 20-value range
  19. MSPROF_ACL_DATA_TAG = 0, // acl data tag, range: 0~19
  20. MSPROF_GE_DATA_TAG_MODEL_LOAD = 20, // ge data tag, range: 20~39
  21. MSPROF_GE_DATA_TAG_FUSION = 21,
  22. MSPROF_GE_DATA_TAG_INFER = 22,
  23. MSPROF_GE_DATA_TAG_TASK = 23,
  24. MSPROF_GE_DATA_TAG_TENSOR = 24,
  25. MSPROF_GE_DATA_TAG_STEP = 25,
  26. MSPROF_GE_DATA_TAG_ID_MAP = 26,
  27. MSPROF_GE_DATA_TAG_HOST_SCH = 27,
  28. MSPROF_RUNTIME_DATA_TAG_API = 40, // runtime data tag, range: 40~59
  29. MSPROF_RUNTIME_DATA_TAG_TRACK = 41,
  30. MSPROF_AICPU_DATA_TAG = 60, // aicpu data tag, range: 60~79
  31. MSPROF_AICPU_MODEL_TAG = 61,
  32. MSPROF_HCCL_DATA_TAG = 80, // hccl data tag, range: 80~99
  33. MSPROF_DP_DATA_TAG = 100, // dp data tag, range: 100~119
  34. MSPROF_MSPROFTX_DATA_TAG = 120, // msproftx data tag, range: 120~139
  35. MSPROF_DATA_TAG_MAX = 65536, // data tag value type is uint16_t; NOTE(review): 65536 does not fit in uint16_t (max 65535), so this is presumably an exclusive upper bound - confirm
  36. };
  37. /**
  38. * @brief struct of mixed data
  39. */
  40. #define MSPROF_MIX_DATA_RESERVE_BYTES 7 // size of the rsv array that follows the 1-byte type field
  41. #define MSPROF_MIX_DATA_STRING_LEN 120 // size in bytes of MsprofMixData::data.dataStr
  42. enum MsprofMixDataType { // values for the `type` field of MsprofMixData and MsprofGeOpType
  43. MSPROF_MIX_DATA_HASH_ID = 0, // presumably: the union holds hashId - confirm against producers
  44. MSPROF_MIX_DATA_STRING, // = 1; presumably: the union holds dataStr - confirm against producers
  45. };
  46. struct MsprofMixData { // value stored either as a 64-bit hash id or as inline characters; members total 128 bytes
  47. uint8_t type; // MsprofMixDataType
  48. uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES]; // together with type, occupies the 8 bytes ahead of the union
  49. union { // both members share the same storage
  50. uint64_t hashId;
  51. char dataStr[MSPROF_MIX_DATA_STRING_LEN]; // NOTE(review): whether this must be NUL-terminated is not established by this header - confirm
  52. } data;
  53. };
  54. #define PATH_LEN_MAX 1023 // path[] is declared with PATH_LEN_MAX + 1 bytes
  55. #define PARAM_LEN_MAX 4095 // profData[] is declared with PARAM_LEN_MAX + 1 bytes
  56. struct MsprofCommandHandleParams { // parameters embedded in MsprofCommandHandle, held in fixed-size buffers
  57. uint32_t pathLen; // presumably the used length of path - confirm
  58. uint32_t storageLimit; // MB
  59. uint32_t profDataLen; // presumably the used length of profData - confirm
  60. char path[PATH_LEN_MAX + 1]; // 1024 bytes
  61. char profData[PARAM_LEN_MAX + 1]; // 4096 bytes
  62. };
  63. /**
  64. * @brief profiling command info
  65. */
  66. #define MSPROF_MAX_DEV_NUM 64 // capacity of MsprofCommandHandle::devIdList
  67. struct MsprofCommandHandle { // profiling command: switch masks, device list, model id, type and parameters
  68. uint64_t profSwitch;
  69. uint64_t profSwitchHi; // NOTE(review): looks like the upper half of a wider switch mask - confirm
  70. uint32_t devNums; // presumably the number of valid entries in devIdList - confirm
  71. uint32_t devIdList[MSPROF_MAX_DEV_NUM];
  72. uint32_t modelId;
  73. uint32_t type; // the set of valid values is not defined in this header
  74. struct MsprofCommandHandleParams params;
  75. };
  76. /**
  77. * @brief struct of data reported by acl
  78. */
  79. #define MSPROF_ACL_DATA_RESERVE_BYTES 32 // size of MsprofAclProfData::reserve
  80. #define MSPROF_ACL_API_NAME_LEN 64 // size of MsprofAclProfData::apiName
  81. enum MsprofAclApiType { // values for MsprofAclProfData::apiType; numbering starts at 1, not 0
  82. MSPROF_ACL_API_TYPE_OP = 1,
  83. MSPROF_ACL_API_TYPE_MODEL, // = 2
  84. MSPROF_ACL_API_TYPE_RUNTIME, // = 3
  85. MSPROF_ACL_API_TYPE_OTHERS, // = 4
  86. };
  87. struct MsprofAclProfData { // one ACL API record; members total 128 bytes under natural alignment
  88. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // NOTE(review): default member initializers are C++11-only, so this struct does not compile as C
  89. uint16_t dataTag = MSPROF_ACL_DATA_TAG;
  90. uint32_t apiType; // enum MsprofAclApiType
  91. uint64_t beginTime; // time unit is not stated in this header
  92. uint64_t endTime;
  93. uint32_t processId;
  94. uint32_t threadId;
  95. char apiName[MSPROF_ACL_API_NAME_LEN]; // 64 bytes
  96. uint8_t reserve[MSPROF_ACL_DATA_RESERVE_BYTES]; // 32 bytes; brings the member total to 128
  97. };
  98. /**
  99. * @brief struct of data reported by GE
  100. */
  101. #define MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES 104 // size of MsprofGeProfModelLoadData::reserve
  102. struct MsprofGeProfModelLoadData { // GE model-load record; members total 256 bytes under natural alignment
  103. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  104. uint16_t dataTag = MSPROF_GE_DATA_TAG_MODEL_LOAD;
  105. uint32_t modelId;
  106. MsprofMixData modelName; // NOTE(review): type named without the `struct` keyword, which is valid only in C++
  107. uint64_t startTime;
  108. uint64_t endTime;
  109. uint8_t reserve[MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES]; // 104 bytes; brings the member total to 256
  110. };
  111. #define MSPROF_GE_FUSION_DATA_RESERVE_BYTES 8 // size of MsprofGeProfFusionData::reserve
  112. #define MSPROF_GE_FUSION_OP_NUM 8 // capacity of MsprofGeProfFusionData::fusionOp
  113. struct MsprofGeProfFusionData { // GE fusion record with memory-size fields; members total 256 bytes under natural alignment
  114. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  115. uint16_t dataTag = MSPROF_GE_DATA_TAG_FUSION;
  116. uint32_t modelId;
  117. MsprofMixData fusionName;
  118. uint64_t inputMemSize; // size unit is not stated in this header
  119. uint64_t outputMemSize;
  120. uint64_t weightMemSize;
  121. uint64_t workspaceMemSize;
  122. uint64_t totalMemSize;
  123. uint64_t fusionOpNum; // presumably the number of valid entries in fusionOp - confirm
  124. uint64_t fusionOp[MSPROF_GE_FUSION_OP_NUM]; // 8 entries; NOTE(review): entry meaning (e.g. hash id) is not shown here - confirm
  125. uint8_t reserve[MSPROF_GE_FUSION_DATA_RESERVE_BYTES];
  126. };
  127. #define MSPROF_GE_INFER_DATA_RESERVE_BYTES 64 // size of MsprofGeProfInferData::reserve
  128. struct MsprofGeProfInferData { // GE inference record holding three start/end time pairs; members total 256 bytes under natural alignment
  129. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  130. uint16_t dataTag = MSPROF_GE_DATA_TAG_INFER;
  131. uint32_t modelId;
  132. MsprofMixData modelName;
  133. uint32_t requestId;
  134. uint32_t threadId;
  135. uint64_t inputDataStartTime; // time unit is not stated in this header
  136. uint64_t inputDataEndTime;
  137. uint64_t inferStartTime;
  138. uint64_t inferEndTime;
  139. uint64_t outputDataStartTime;
  140. uint64_t outputDataEndTime;
  141. uint8_t reserve[MSPROF_GE_INFER_DATA_RESERVE_BYTES];
  142. };
  143. constexpr int32_t MSPROF_GE_TASK_DATA_RESERVE_BYTES = 12; // size of MsprofGeProfTaskData::reserve; NOTE(review): C++11 constexpr, whereas every other size constant in this header is a #define
  144. #define MSPROF_GE_OP_TYPE_LEN 56 // size of MsprofGeOpType::data.dataStr
  145. enum MsprofGeTaskType { // values for MsprofGeProfTaskData::taskType
  146. MSPROF_GE_TASK_TYPE_AI_CORE = 0,
  147. MSPROF_GE_TASK_TYPE_AI_CPU, // = 1
  148. MSPROF_GE_TASK_TYPE_AIV, // = 2
  149. MSPROF_GE_TASK_TYPE_WRITE_BACK, // = 3
  150. MSPROF_GE_TASK_TYPE_INVALID // = 4
  151. };
  152. enum MsprofGeShapeType { // values for MsprofGeProfTaskData::shapeType
  153. MSPROF_GE_SHAPE_TYPE_STATIC = 0,
  154. MSPROF_GE_SHAPE_TYPE_DYNAMIC, // = 1
  155. };
  156. struct MsprofGeOpType { // same member layout as MsprofMixData but with a 56-byte string buffer; members total 64 bytes
  157. uint8_t type; // MsprofMixDataType
  158. uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES]; // reuses the MsprofMixData reserve size (7)
  159. union { // both members share the same storage
  160. uint64_t hashId;
  161. char dataStr[MSPROF_GE_OP_TYPE_LEN];
  162. } data;
  163. };
  164. struct MsprofGeProfTaskData { // GE task record; members total 256 bytes under natural alignment
  165. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  166. uint16_t dataTag = MSPROF_GE_DATA_TAG_TASK;
  167. uint32_t taskType; // MsprofGeTaskType
  168. MsprofMixData opName; // 128 bytes
  169. MsprofGeOpType opType; // 64 bytes
  170. uint64_t curIterNum;
  171. uint64_t timeStamp; // time unit is not stated in this header
  172. uint32_t shapeType; // MsprofGeShapeType
  173. uint32_t blockDims;
  174. uint32_t modelId;
  175. uint32_t streamId;
  176. uint32_t taskId;
  177. uint32_t threadId;
  178. uint32_t contextId;
  179. uint8_t reserve[MSPROF_GE_TASK_DATA_RESERVE_BYTES]; // 12 bytes; brings the member total to 256
  180. };
  181. #define MSPROF_GE_TENSOR_DATA_RESERVE_BYTES 8 // size of MsprofGeProfTensorData::reserve
  182. #define MSPROF_GE_TENSOR_DATA_SHAPE_LEN 8 // number of entries in MsprofGeTensorData::shape
  183. #define MSPROF_GE_TENSOR_DATA_NUM 5 // number of entries in MsprofGeProfTensorData::tensorData
  184. enum MsprofGeTensorType { // values for MsprofGeTensorData::tensorType
  185. MSPROF_GE_TENSOR_TYPE_INPUT = 0,
  186. MSPROF_GE_TENSOR_TYPE_OUTPUT, // = 1
  187. };
  188. struct MsprofGeTensorData { // description of one tensor; eleven uint32_t values, 44 bytes
  189. uint32_t tensorType; // MsprofGeTensorType
  190. uint32_t format; // value set is not defined in this header
  191. uint32_t dataType; // value set is not defined in this header
  192. uint32_t shape[MSPROF_GE_TENSOR_DATA_SHAPE_LEN]; // 8 entries; NOTE(review): how unused dimensions are encoded is not shown here - confirm
  193. };
  194. struct MsprofGeProfTensorData { // GE tensor record carrying up to 5 tensor descriptions; members total 256 bytes under natural alignment
  195. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  196. uint16_t dataTag = MSPROF_GE_DATA_TAG_TENSOR;
  197. uint32_t modelId;
  198. uint64_t curIterNum;
  199. uint32_t streamId;
  200. uint32_t taskId;
  201. uint32_t tensorNum; // presumably the number of valid entries in tensorData - confirm
  202. MsprofGeTensorData tensorData[MSPROF_GE_TENSOR_DATA_NUM]; // 5 x 44 bytes
  203. uint8_t reserve[MSPROF_GE_TENSOR_DATA_RESERVE_BYTES];
  204. };
  205. #define MSPROF_GE_STEP_DATA_RESERVE_BYTES 27 // size of MsprofGeProfStepData::reserve
  206. enum MsprofGeStepTag { // values for MsprofGeProfStepData::tag
  207. MSPROF_GE_STEP_TAG_BEGIN = 0,
  208. MSPROF_GE_STEP_TAG_END, // = 1
  209. };
  210. struct MsprofGeProfStepData { // GE step begin/end record; members total 64 bytes under natural alignment
  211. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  212. uint16_t dataTag = MSPROF_GE_DATA_TAG_STEP;
  213. uint32_t modelId;
  214. uint32_t streamId;
  215. uint32_t taskId;
  216. uint64_t timeStamp; // time unit is not stated in this header
  217. uint64_t curIterNum;
  218. uint32_t threadId;
  219. uint8_t tag; // MsprofGeStepTag
  220. uint8_t reserve[MSPROF_GE_STEP_DATA_RESERVE_BYTES]; // 27 bytes following the 1-byte tag; brings the member total to 64
  221. };
  222. #define MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES 6 // size of MsprofGeProfIdMapData::reserve
  223. struct MsprofGeProfIdMapData { // record holding a graph id, model id and session id together; members total 32 bytes under natural alignment
  224. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  225. uint16_t dataTag = MSPROF_GE_DATA_TAG_ID_MAP;
  226. uint32_t graphId;
  227. uint32_t modelId;
  228. uint32_t sessionId;
  229. uint64_t timeStamp; // time unit is not stated in this header
  230. uint16_t mode; // value set is not defined in this header
  231. uint8_t reserve[MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES];
  232. };
  233. #define MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES 24 // size of MsprofGeProfHostSchData::reserve
  234. struct MsprofGeProfHostSchData { // GE host-scheduling record; members total 64 bytes under natural alignment
  235. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  236. uint16_t dataTag = MSPROF_GE_DATA_TAG_HOST_SCH;
  237. uint32_t threadId; // record in start event
  238. uint64_t element; // NOTE(review): meaning (id or hash) is not shown in this header - confirm
  239. uint64_t event; // NOTE(review): meaning (id or hash) is not shown in this header - confirm
  240. uint64_t startTime; // record in start event
  241. uint64_t endTime; // record in end event
  242. uint8_t reserve[MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES];
  243. };
  244. /**
  245. * @brief struct of data reported by RunTime
  246. */
  247. #define MSPROF_RUNTIME_API_DATA_RESERVE_BYTES 106 // size of MsprofRuntimeProfApiData::reserve
  248. #define MSPROF_RUNTIME_TASK_ID_NUM 10 // capacity of MsprofRuntimeProfApiData::taskId
  249. #define MSPROF_RUNTIME_API_NAME_LEN 64 // size of MsprofRuntimeProfApiData::apiName
  250. struct MsprofRuntimeProfApiData { // one runtime API call record; members total 256 bytes under natural alignment
  251. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  252. uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_API;
  253. uint32_t threadId;
  254. uint64_t entryTime; // time unit is not stated in this header
  255. uint64_t exitTime;
  256. uint64_t dataSize;
  257. uint8_t apiName[MSPROF_RUNTIME_API_NAME_LEN]; // declared as uint8_t, whereas MsprofAclProfData::apiName is char
  258. uint32_t retCode;
  259. uint32_t streamId;
  260. uint32_t taskNum; // presumably the number of valid entries in taskId - confirm
  261. uint32_t taskId[MSPROF_RUNTIME_TASK_ID_NUM];
  262. uint16_t memcpyDirection; // value set is not defined in this header
  263. uint8_t reserve[MSPROF_RUNTIME_API_DATA_RESERVE_BYTES]; // 106 bytes; brings the member total to 256
  264. };
  265. #define MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES 10 // size of MsprofRuntimeProfTrackData::reserve
  266. #define MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN 32 // size of MsprofRuntimeProfTrackData::taskType
  267. struct MsprofRuntimeProfTrackData { // runtime task-track record; members total 64 bytes under natural alignment
  268. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  269. uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_TRACK;
  270. uint32_t threadId;
  271. uint64_t timeStamp; // time unit is not stated in this header
  272. char taskType[MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN]; // task type stored as text in a 32-byte buffer
  273. uint32_t taskId;
  274. uint16_t streamId; // 16-bit here, while MsprofRuntimeProfApiData::streamId is 32-bit
  275. uint8_t reserve[MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES];
  276. };
  277. /**
  278. * @brief struct of data reported by AICPU
  279. */
  280. #define MSPROF_AICPU_DATA_RESERVE_BYTES 9 // size of MsprofAicpuProfData::reserve
  281. struct MsprofAicpuProfData { // AICPU task record with time and tick fields; members total 128 bytes under natural alignment
  282. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  283. uint16_t dataTag = MSPROF_AICPU_DATA_TAG;
  284. uint16_t streamId;
  285. uint16_t taskId;
  286. uint64_t runStartTime; // NOTE(review): units of the *Time vs *Tick fields are not stated in this header - confirm
  287. uint64_t runStartTick;
  288. uint64_t computeStartTime;
  289. uint64_t memcpyStartTime;
  290. uint64_t memcpyEndTime;
  291. uint64_t runEndTime;
  292. uint64_t runEndTick;
  293. uint32_t threadId;
  294. uint32_t deviceId;
  295. uint64_t submitTick;
  296. uint64_t scheduleTick;
  297. uint64_t tickBeforeRun;
  298. uint64_t tickAfterRun;
  299. uint32_t kernelType; // value set is not defined in this header
  300. uint32_t dispatchTime;
  301. uint32_t totalTime;
  302. uint16_t fftsThreadId;
  303. uint8_t version; // NOTE(review): presumably a layout version of this record - confirm
  304. uint8_t reserve[MSPROF_AICPU_DATA_RESERVE_BYTES]; // 9 bytes; brings the member total to 128
  305. };
  306. struct MsprofAicpuModelProfData { // AICPU model-level record; members total 64 bytes under natural alignment
  307. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  308. uint16_t dataTag = MSPROF_AICPU_MODEL_TAG;
  309. uint32_t rsv; // Ensure 8-byte alignment
  310. uint64_t timeStamp; // time unit is not stated in this header
  311. uint64_t indexId;
  312. uint32_t modelId;
  313. uint16_t tagId;
  314. uint16_t rsv1; // fills the 2 bytes before the 8-byte eventId
  315. uint64_t eventId;
  316. uint8_t reserve[24]; // literal size; the other records in this header use a named *_RESERVE_BYTES constant
  317. };
  318. /**
  319. * @brief struct of data reported by DP
  320. */
  321. #define MSPROF_DP_DATA_RESERVE_BYTES 16 // size of MsprofDpProfData::reserve
  322. #define MSPROF_DP_DATA_ACTION_LEN 16 // size of MsprofDpProfData::action
  323. #define MSPROF_DP_DATA_SOURCE_LEN 64 // size of MsprofDpProfData::source
  324. struct MsprofDpProfData { // DP record; members total 128 bytes under natural alignment
  325. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  326. uint16_t dataTag = MSPROF_DP_DATA_TAG;
  327. uint32_t rsv; // Ensure 8-byte alignment
  328. uint64_t timeStamp; // time unit is not stated in this header
  329. char action[MSPROF_DP_DATA_ACTION_LEN]; // text buffer, 16 bytes
  330. char source[MSPROF_DP_DATA_SOURCE_LEN]; // text buffer, 64 bytes
  331. uint64_t index;
  332. uint64_t size;
  333. uint8_t reserve[MSPROF_DP_DATA_RESERVE_BYTES];
  334. };
  335. /**
  336. * @brief struct of data reported by HCCL
  337. */
  338. #pragma pack(4) // caps member alignment at 4 bytes for every struct up to the #pragma pack() that follows MsprofHcclProfData
  339. struct MsprofHcclProfNotify { // HCCL notify arguments; one member of the MsprofHcclProfData::args union; 36 bytes under pack(4)
  340. uint32_t taskID;
  341. uint64_t notifyID; // sits at offset 4 because of pack(4); no padding after taskID
  342. uint32_t stage;
  343. uint32_t remoteRank;
  344. uint32_t transportType; // NOTE(review): no value mapping given here; sibling structs document conflicting mappings - confirm
  345. uint32_t role; // role {0: dst, 1:src}
  346. double durationEstimated; // unit is not stated in this header
  347. };
  348. struct MsprofHcclProfReduce { // HCCL reduce arguments; one member of the MsprofHcclProfData::args union; 60 bytes under pack(4)
  349. uint32_t taskID;
  350. uint64_t src; // NOTE(review): presumably a source address - confirm
  351. uint64_t dst; // NOTE(review): presumably a destination address - confirm
  352. uint64_t size;
  353. uint32_t op; // {0: sum, 1: mul, 2: max, 3: min}
  354. uint32_t dataType; // data type {0: INT8, 1: INT16, 2: INT32, 3: FP16, 4:FP32, 5:INT64, 6:UINT64}
  355. uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'}
  356. uint32_t remoteRank;
  357. uint32_t transportType; // transport type {0: SDMA, 1: RDMA, 2:LOCAL}; NOTE(review): MsprofHcclProfRDMA and MsprofHcclProfMemcpy document {0: RDMA, 1: SDMA} - confirm which mapping is correct
  358. uint32_t role; // role {0: dst, 1:src}
  359. double durationEstimated; // unit is not stated in this header
  360. };
  361. struct MsprofHcclProfRDMA { // HCCL RDMA arguments; largest member of the MsprofHcclProfData::args union; 64 bytes under pack(4)
  362. uint32_t taskID;
  363. uint64_t src; // NOTE(review): presumably a source address - confirm
  364. uint64_t dst; // NOTE(review): presumably a destination address - confirm
  365. uint64_t size;
  366. uint64_t notifyID;
  367. uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'}
  368. uint32_t remoteRank;
  369. uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL}; NOTE(review): MsprofHcclProfReduce documents {0: SDMA, 1: RDMA} - confirm which mapping is correct
  370. uint32_t role; // role {0: dst, 1:src}
  371. uint32_t type; // RDMA type {0: RDMASendNotify, 1:RDMASendPayload}
  372. double durationEstimated; // unit is not stated in this header
  373. };
  374. struct MsprofHcclProfMemcpy { // HCCL memcpy arguments; one member of the MsprofHcclProfData::args union; 60 bytes under pack(4)
  375. uint32_t taskID;
  376. uint64_t src; // NOTE(review): presumably a source address - confirm
  377. uint64_t dst; // NOTE(review): presumably a destination address - confirm
  378. uint64_t size;
  379. uint64_t notifyID;
  380. uint32_t linkType; // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'}
  381. uint32_t remoteRank;
  382. uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL}; NOTE(review): MsprofHcclProfReduce documents {0: SDMA, 1: RDMA} - confirm which mapping is correct
  383. uint32_t role; // role {0: dst, 1:src}
  384. double durationEstimated; // unit is not stated in this header
  385. };
  386. struct MsprofHcclProfStageStep { // HCCL stage/step arguments; one member of the MsprofHcclProfData::args union; 8 bytes
  387. uint32_t rank;
  388. uint32_t rankSize;
  389. };
  390. struct MsprofHcclProfFlag { // HCCL flag arguments; one member of the MsprofHcclProfData::args union; 24 bytes
  391. uint64_t cclTag; // NOTE(review): 64-bit value, presumably a hash of the tag string - confirm
  392. uint64_t groupName; // NOTE(review): 64-bit value, presumably a hash of the group name - confirm
  393. uint32_t localRank;
  394. uint32_t workFlowMode; // value set is not defined in this header
  395. };
  396. /**
  397. * @name MsprofHcclProfData
  398. * @brief struct of data reported by hccl
  399. */
  400. struct MsprofHcclProfData { // HCCL record: common header fields plus one per-operation argument struct; 104 bytes under pack(4)
  401. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM; // C++11 default member initializer
  402. uint16_t dataTag = MSPROF_HCCL_DATA_TAG;
  403. uint32_t planeID;
  404. uint32_t deviceID;
  405. uint32_t streamID;
  406. double ts; // timestamp held as a double, unlike the uint64_t time fields elsewhere in this header; unit not stated
  407. char name[16]; // NOTE(review): presumably identifies which args member is in use - confirm
  408. union { // members share storage; its size is that of the largest member (MsprofHcclProfRDMA, 64 bytes)
  409. MsprofHcclProfNotify notify;
  410. MsprofHcclProfReduce reduce;
  411. MsprofHcclProfStageStep stageStep;
  412. MsprofHcclProfMemcpy forMemcpy;
  413. MsprofHcclProfRDMA RDMA;
  414. MsprofHcclProfFlag flag;
  415. } args;
  416. };
  417. #pragma pack() // restores the default packing changed by #pragma pack(4) before the HCCL structs
  418. /**
  419. * @name MsprofStampInfo
  420. * @brief struct of data reported by msproftx
  421. */
  422. struct MsprofStampInfo { // msproftx marker record; members total 256 bytes under natural alignment
  423. uint16_t magicNumber; // no default initializer here, unlike the other records; NOTE(review): presumably set by the producer to MSPROF_DATA_HEAD_MAGIC_NUM - confirm
  424. uint16_t dataTag; // no default initializer here; NOTE(review): presumably MSPROF_MSPROFTX_DATA_TAG - confirm
  425. uint32_t processId;
  426. uint32_t threadId;
  427. uint32_t category; // marker category
  428. uint32_t eventType; // value set is not defined in this header
  429. int32_t payloadType; // NOTE(review): presumably selects which PayloadValue member is valid - confirm
  430. union PayloadValue { // 8 bytes; every member aliases the same storage
  431. uint64_t ullValue;
  432. int64_t llValue;
  433. double dValue;
  434. uint32_t uiValue[2];
  435. int32_t iValue[2];
  436. float fValue[2];
  437. } payload; // payload info for marker
  438. uint64_t startTime; // time unit is not stated in this header
  439. uint64_t endTime;
  440. int32_t messageType; // value set is not defined in this header
  441. char message[128]; // literal size; text buffer
  442. uint8_t reserve0[4]; // fills the 4 bytes after messageType + message
  443. uint8_t reserve1[72]; // brings the member total to 256
  444. };
  445. #ifdef __cplusplus
  446. }
  447. #endif
  448. #endif // MSPROFILER_PROF_COMMON_H_

图引擎模块（GE）是MindSpore的一个子模块，其代码由C++实现，位于前端模块ME和底层硬件之间，起到承接作用。图引擎模块以ME下发的图作为输入，然后进行一系列的深度图优化操作，最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点，做了特定的优化工作，以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时，GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成，详细的架构图如下所示。