You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

prof_common.h 14 kB


  1. /*
  2. * Copyright (c) Huawei Technologies Co., Ltd. 2019-2021. All rights reserved.
  3. * Description: handle perf data
  4. * Author: Huawei Technologies Co., Ltd.
  5. * Create: 2019-10-13
  6. */
  7. #ifndef MSPROFILER_PROF_COMMON_H_
  8. #define MSPROFILER_PROF_COMMON_H_
  9. #ifdef __cplusplus
  10. extern "C" {
  11. #endif // __cplusplus
  12. #include <stdint.h>
  13. #define MSPROF_DATA_HEAD_MAGIC_NUM 0x5a5a
  /**
   * @brief tag values used in the dataTag field of the report structs.
   *        Each reporting component is assigned its own range of 20 values.
   */
  enum MsprofDataTag {
      MSPROF_ACL_DATA_TAG = 0,             // acl data tag, range: 0~19
      MSPROF_GE_DATA_TAG_MODEL_LOAD = 20,  // ge data tag, range: 20~39
      MSPROF_GE_DATA_TAG_FUSION = 21,
      MSPROF_GE_DATA_TAG_INFER = 22,
      MSPROF_GE_DATA_TAG_TASK = 23,
      MSPROF_GE_DATA_TAG_TENSOR = 24,
      MSPROF_GE_DATA_TAG_STEP = 25,
      MSPROF_GE_DATA_TAG_ID_MAP = 26,
      MSPROF_GE_DATA_TAG_HOST_SCH = 27,
      MSPROF_RUNTIME_DATA_TAG_API = 40,    // runtime data tag, range: 40~59
      MSPROF_RUNTIME_DATA_TAG_TRACK = 41,
      MSPROF_AICPU_DATA_TAG = 60,          // aicpu data tag, range: 60~79
      MSPROF_AICPU_MODEL_TAG = 61,
      MSPROF_HCCL_DATA_TAG = 80,           // hccl data tag, range: 80~99
      MSPROF_DP_DATA_TAG = 100,            // dp data tag, range: 100~119
      MSPROF_MSPROFTX_DATA_TAG = 120,      // msproftx data tag, range: 120~139
      // data tag value type is uint16_t; 65536 is one past the largest uint16_t value
      MSPROF_DATA_TAG_MAX = 65536,
  };
  /**
   * @brief struct of mixed data: a one-byte type discriminator followed by
   *        either a 64-bit hash id or an inline character buffer.
   */
  #define MSPROF_MIX_DATA_RESERVE_BYTES 7
  #define MSPROF_MIX_DATA_STRING_LEN 120
  /* Selects which member of MsprofMixData::data is in use. */
  enum MsprofMixDataType {
      MSPROF_MIX_DATA_HASH_ID = 0,
      MSPROF_MIX_DATA_STRING,
  };
  struct MsprofMixData {
      uint8_t type; // MsprofMixDataType
      uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES]; // pads the header to 8 bytes ahead of the union
      union {
          uint64_t hashId;
          char dataStr[MSPROF_MIX_DATA_STRING_LEN];
      } data;
  };
  #define PATH_LEN_MAX 1023
  #define PARAM_LEN_MAX 4095
  /**
   * @brief parameter payload embedded in MsprofCommandHandle.
   *        Both buffers are sized one byte larger than their *_LEN_MAX limit
   *        (presumably room for a terminating NUL; confirm with the producer).
   */
  struct MsprofCommandHandleParams {
      uint32_t pathLen;      // presumably the used length of path; confirm with the producer
      uint32_t storageLimit; // MB
      uint32_t profDataLen;  // presumably the used length of profData; confirm with the producer
      char path[PATH_LEN_MAX + 1];
      char profData[PARAM_LEN_MAX + 1];
  };
  59. /**
  60. * @brief profiling command info
  61. */
  62. #define MSPROF_MAX_DEV_NUM 64
  63. struct MsprofCommandHandle {
  64. uint64_t profSwitch;
  65. uint64_t profSwitchHi;
  66. uint32_t devNums;
  67. uint32_t devIdList[MSPROF_MAX_DEV_NUM];
  68. uint32_t modelId;
  69. uint32_t type;
  70. struct MsprofCommandHandleParams params;
  71. };
  /**
   * @brief struct of data reported by acl
   */
  #define MSPROF_ACL_DATA_RESERVE_BYTES 32
  #define MSPROF_ACL_API_NAME_LEN 64
  /* Values for the apiType field of MsprofAclProfData; numbering starts at 1. */
  enum MsprofAclApiType {
      MSPROF_ACL_API_TYPE_OP = 1,
      MSPROF_ACL_API_TYPE_MODEL,
      MSPROF_ACL_API_TYPE_RUNTIME,
      MSPROF_ACL_API_TYPE_OTHERS,
  };
  83. struct MsprofAclProfData {
  84. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  85. uint16_t dataTag = MSPROF_ACL_DATA_TAG;
  86. uint32_t apiType; // enum MsprofAclApiType
  87. uint64_t beginTime;
  88. uint64_t endTime;
  89. uint32_t processId;
  90. uint32_t threadId;
  91. char apiName[MSPROF_ACL_API_NAME_LEN];
  92. uint8_t reserve[MSPROF_ACL_DATA_RESERVE_BYTES];
  93. };
  94. /**
  95. * @brief struct of data reported by GE
  96. */
  97. #define MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES 104
  98. struct MsprofGeProfModelLoadData {
  99. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  100. uint16_t dataTag = MSPROF_GE_DATA_TAG_MODEL_LOAD;
  101. uint32_t modelId;
  102. MsprofMixData modelName;
  103. uint64_t startTime;
  104. uint64_t endTime;
  105. uint8_t reserve[MSPROF_GE_MODELLOAD_DATA_RESERVE_BYTES];
  106. };
  107. #define MSPROF_GE_FUSION_DATA_RESERVE_BYTES 8
  108. #define MSPROF_GE_FUSION_OP_NUM 8
  109. struct MsprofGeProfFusionData {
  110. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  111. uint16_t dataTag = MSPROF_GE_DATA_TAG_FUSION;
  112. uint32_t modelId;
  113. MsprofMixData fusionName;
  114. uint64_t inputMemSize;
  115. uint64_t outputMemSize;
  116. uint64_t weightMemSize;
  117. uint64_t workspaceMemSize;
  118. uint64_t totalMemSize;
  119. uint64_t fusionOpNum;
  120. uint64_t fusionOp[MSPROF_GE_FUSION_OP_NUM];
  121. uint8_t reserve[MSPROF_GE_FUSION_DATA_RESERVE_BYTES];
  122. };
  123. #define MSPROF_GE_INFER_DATA_RESERVE_BYTES 64
  124. struct MsprofGeProfInferData {
  125. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  126. uint16_t dataTag = MSPROF_GE_DATA_TAG_INFER;
  127. uint32_t modelId;
  128. MsprofMixData modelName;
  129. uint32_t requestId;
  130. uint32_t threadId;
  131. uint64_t inputDataStartTime;
  132. uint64_t inputDataEndTime;
  133. uint64_t inferStartTime;
  134. uint64_t inferEndTime;
  135. uint64_t outputDataStartTime;
  136. uint64_t outputDataEndTime;
  137. uint8_t reserve[MSPROF_GE_INFER_DATA_RESERVE_BYTES];
  138. };
  #define MSPROF_GE_TASK_DATA_RESERVE_BYTES 12
  #define MSPROF_GE_OP_TYPE_LEN 56
  /* Values for the taskType field of MsprofGeProfTaskData. */
  enum MsprofGeTaskType {
      MSPROF_GE_TASK_TYPE_AI_CORE = 0,
      MSPROF_GE_TASK_TYPE_AI_CPU,
      MSPROF_GE_TASK_TYPE_AIV,
  };
  /* Values for the shapeType field of MsprofGeProfTaskData. */
  enum MsprofGeShapeType {
      MSPROF_GE_SHAPE_TYPE_STATIC = 0,
      MSPROF_GE_SHAPE_TYPE_DYNAMIC,
  };
  150. struct MsprofGeOpType {
  151. uint8_t type; // MsprofMixDataType
  152. uint8_t rsv[MSPROF_MIX_DATA_RESERVE_BYTES];
  153. union {
  154. uint64_t hashId;
  155. char dataStr[MSPROF_GE_OP_TYPE_LEN];
  156. } data;
  157. };
  158. struct MsprofGeProfTaskData {
  159. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  160. uint16_t dataTag = MSPROF_GE_DATA_TAG_TASK;
  161. uint32_t taskType; // MsprofGeTaskType
  162. MsprofMixData opName;
  163. MsprofGeOpType opType;
  164. uint64_t curIterNum;
  165. uint64_t timeStamp;
  166. uint32_t shapeType; // MsprofGeShapeType
  167. uint32_t blockDims;
  168. uint32_t modelId;
  169. uint32_t streamId;
  170. uint32_t taskId;
  171. uint32_t threadId;
  172. uint32_t contextId;
  173. uint8_t reserve[MSPROF_GE_TASK_DATA_RESERVE_BYTES];
  174. };
  #define MSPROF_GE_TENSOR_DATA_RESERVE_BYTES 8
  #define MSPROF_GE_TENSOR_DATA_SHAPE_LEN 8
  #define MSPROF_GE_TENSOR_DATA_NUM 5
  /* Values for the tensorType field of MsprofGeTensorData. */
  enum MsprofGeTensorType {
      MSPROF_GE_TENSOR_TYPE_INPUT = 0,
      MSPROF_GE_TENSOR_TYPE_OUTPUT,
  };
  182. struct MsprofGeTensorData {
  183. uint32_t tensorType; // MsprofGeTensorType
  184. uint32_t format;
  185. uint32_t dataType;
  186. uint32_t shape[MSPROF_GE_TENSOR_DATA_SHAPE_LEN];
  187. };
  188. struct MsprofGeProfTensorData {
  189. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  190. uint16_t dataTag = MSPROF_GE_DATA_TAG_TENSOR;
  191. uint32_t modelId;
  192. uint64_t curIterNum;
  193. uint32_t streamId;
  194. uint32_t taskId;
  195. uint32_t tensorNum;
  196. MsprofGeTensorData tensorData[MSPROF_GE_TENSOR_DATA_NUM];
  197. uint8_t reserve[MSPROF_GE_TENSOR_DATA_RESERVE_BYTES];
  198. };
  #define MSPROF_GE_STEP_DATA_RESERVE_BYTES 27
  /* Values for the tag field of MsprofGeProfStepData. */
  enum MsprofGeStepTag {
      MSPROF_GE_STEP_TAG_BEGIN = 0,
      MSPROF_GE_STEP_TAG_END,
  };
  204. struct MsprofGeProfStepData {
  205. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  206. uint16_t dataTag = MSPROF_GE_DATA_TAG_STEP;
  207. uint32_t modelId;
  208. uint32_t streamId;
  209. uint32_t taskId;
  210. uint64_t timeStamp;
  211. uint64_t curIterNum;
  212. uint32_t threadId;
  213. uint8_t tag; // MsprofGeStepTag
  214. uint8_t reserve[MSPROF_GE_STEP_DATA_RESERVE_BYTES];
  215. };
  216. #define MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES 6
  217. struct MsprofGeProfIdMapData {
  218. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  219. uint16_t dataTag = MSPROF_GE_DATA_TAG_ID_MAP;
  220. uint32_t graphId;
  221. uint32_t modelId;
  222. uint32_t sessionId;
  223. uint64_t timeStamp;
  224. uint16_t mode;
  225. uint8_t reserve[MSPROF_GE_ID_MAP_DATA_RESERVE_BYTES];
  226. };
  227. #define MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES 24
  228. struct MsprofGeProfHostSchData {
  229. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  230. uint16_t dataTag = MSPROF_GE_DATA_TAG_HOST_SCH;
  231. uint32_t threadId; // record in start event
  232. uint64_t element;
  233. uint64_t event;
  234. uint64_t startTime; // record in start event
  235. uint64_t endTime; // record in end event
  236. uint8_t reserve[MSPROF_GE_HOST_SCH_DATA_RESERVE_BYTES];
  237. };
  238. /**
  239. * @brief struct of data reported by RunTime
  240. */
  241. #define MSPROF_RUNTIME_API_DATA_RESERVE_BYTES 106
  242. #define MSPROF_RUNTIME_TASK_ID_NUM 10
  243. #define MSPROF_RUNTIME_API_NAME_LEN 64
  244. struct MsprofRuntimeProfApiData {
  245. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  246. uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_API;
  247. uint32_t threadId;
  248. uint64_t entryTime;
  249. uint64_t exitTime;
  250. uint64_t dataSize;
  251. uint8_t apiName[MSPROF_RUNTIME_API_NAME_LEN];
  252. uint32_t retCode;
  253. uint32_t streamId;
  254. uint32_t taskNum;
  255. uint32_t taskId[MSPROF_RUNTIME_TASK_ID_NUM];
  256. uint16_t memcpyDirection;
  257. uint8_t reserve[MSPROF_RUNTIME_API_DATA_RESERVE_BYTES];
  258. };
  259. #define MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES 10
  260. #define MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN 32
  261. struct MsprofRuntimeProfTrackData {
  262. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  263. uint16_t dataTag = MSPROF_RUNTIME_DATA_TAG_TRACK;
  264. uint32_t threadId;
  265. uint64_t timeStamp;
  266. char taskType[MSPROF_RUNTIME_TRACK_TASK_TYPE_LEN];
  267. uint32_t taskId;
  268. uint16_t streamId;
  269. uint8_t reserve[MSPROF_RUNTIME_TRACK_DATA_RESERVE_BYTES];
  270. };
  271. /**
  272. * @brief struct of data reported by RunTime
  273. */
  274. #define MSPROF_AICPU_DATA_RESERVE_BYTES 9
  275. struct MsprofAicpuProfData {
  276. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  277. uint16_t dataTag = MSPROF_AICPU_DATA_TAG;
  278. uint16_t streamId;
  279. uint16_t taskId;
  280. uint64_t runStartTime;
  281. uint64_t runStartTick;
  282. uint64_t computeStartTime;
  283. uint64_t memcpyStartTime;
  284. uint64_t memcpyEndTime;
  285. uint64_t runEndTime;
  286. uint64_t runEndTick;
  287. uint32_t threadId;
  288. uint32_t deviceId;
  289. uint64_t submitTick;
  290. uint64_t scheduleTick;
  291. uint64_t tickBeforeRun;
  292. uint64_t tickAfterRun;
  293. uint32_t kernelType;
  294. uint32_t dispatchTime;
  295. uint32_t totalTime;
  296. uint16_t fftsThreadId;
  297. uint8_t version;
  298. uint8_t reserve[MSPROF_AICPU_DATA_RESERVE_BYTES];
  299. };
  300. struct MsprofAicpuModelProfData {
  301. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  302. uint16_t dataTag = MSPROF_AICPU_MODEL_TAG;
  303. uint32_t rsv; // Ensure 8-byte alignment
  304. uint64_t timeStamp;
  305. uint64_t indexId;
  306. uint32_t modelId;
  307. uint16_t tagId;
  308. uint16_t rsv1;
  309. uint64_t eventId;
  310. uint8_t reserve[24];
  311. };
  312. /**
  313. * @brief struct of data reported by DP
  314. */
  315. #define MSPROF_DP_DATA_RESERVE_BYTES 16
  316. #define MSPROF_DP_DATA_ACTION_LEN 16
  317. #define MSPROF_DP_DATA_SOURCE_LEN 64
  318. struct MsprofDpProfData {
  319. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  320. uint16_t dataTag = MSPROF_DP_DATA_TAG;
  321. uint32_t rsv; // Ensure 8-byte alignment
  322. uint64_t timeStamp;
  323. char action[MSPROF_DP_DATA_ACTION_LEN];
  324. char source[MSPROF_DP_DATA_SOURCE_LEN];
  325. uint64_t index;
  326. uint64_t size;
  327. uint8_t reserve[MSPROF_DP_DATA_RESERVE_BYTES];
  328. };
  /**
   * @brief struct of data reported by HCCL
   */
  /* 4-byte packing stays in effect for every struct declared until the matching "#pragma pack()". */
  #pragma pack(4)
  /* Arguments of a notify task; one of the alternatives in MsprofHcclProfData::args. */
  struct MsprofHcclProfNotify {
      uint32_t taskID;
      uint64_t notifyID; // lands at offset 4 under pack(4): no padding after taskID
      uint32_t stage;
      uint32_t remoteRank;
      uint32_t transportType;
      uint32_t role; // role {0: dst, 1:src}
      double durationEstimated;
  };
  /* Arguments of a reduce task; one of the alternatives in MsprofHcclProfData::args. */
  struct MsprofHcclProfReduce {
      uint32_t taskID;
      uint64_t src;
      uint64_t dst;
      uint64_t size;
      uint32_t op;            // {0: sum, 1: mul, 2: max, 3: min}
      uint32_t dataType;      // data type {0: INT8, 1: INT16, 2: INT32, 3: FP16, 4:FP32, 5:INT64, 6:UINT64}
      uint32_t linkType;      // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'}
      uint32_t remoteRank;
      // NOTE(review): the RDMA and Memcpy structs document {0: RDMA, 1: SDMA}; confirm which mapping is right
      uint32_t transportType; // transport type {0: SDMA, 1: RDMA, 2:LOCAL}
      uint32_t role;          // role {0: dst, 1:src}
      double durationEstimated;
  };
  /* Arguments of an RDMA task; one of the alternatives in MsprofHcclProfData::args. */
  struct MsprofHcclProfRDMA {
      uint32_t taskID;
      uint64_t src;
      uint64_t dst;
      uint64_t size;
      uint64_t notifyID;
      uint32_t linkType;      // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'}
      uint32_t remoteRank;
      uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL}
      uint32_t role;          // role {0: dst, 1:src}
      uint32_t type;          // RDMA type {0: RDMASendNotify, 1:RDMASendPayload}
      double durationEstimated;
  };
  /* Arguments of a memcpy task; one of the alternatives in MsprofHcclProfData::args. */
  struct MsprofHcclProfMemcpy {
      uint32_t taskID;
      uint64_t src;
      uint64_t dst;
      uint64_t size;
      uint64_t notifyID;
      uint32_t linkType;      // link type {0: 'OnChip', 1: 'HCCS', 2: 'PCIe', 3: 'RoCE'}
      uint32_t remoteRank;
      uint32_t transportType; // transport type {0: RDMA, 1:SDMA, 2:LOCAL}
      uint32_t role;          // role {0: dst, 1:src}
      double durationEstimated;
  };
  /* Rank information; one of the alternatives in MsprofHcclProfData::args. */
  struct MsprofHcclProfStageStep {
      uint32_t rank;
      uint32_t rankSize;
  };
  /* Flag information; one of the alternatives in MsprofHcclProfData::args. */
  struct MsprofHcclProfFlag {
      uint64_t cclTag;
      uint64_t groupName; // stored as a 64-bit integer, presumably a hash of the name; confirm
      uint32_t localRank;
      uint32_t workFlowMode;
  };
  390. /**
  391. * @name MsprofHcclProfData
  392. * @brief struct of data reported by hccl
  393. */
  394. struct MsprofHcclProfData {
  395. uint16_t magicNumber = MSPROF_DATA_HEAD_MAGIC_NUM;
  396. uint16_t dataTag = MSPROF_HCCL_DATA_TAG;
  397. uint32_t planeID;
  398. uint32_t deviceID;
  399. uint32_t streamID;
  400. double ts;
  401. char name[16];
  402. union {
  403. MsprofHcclProfNotify notify;
  404. MsprofHcclProfReduce reduce;
  405. MsprofHcclProfStageStep stageStep;
  406. MsprofHcclProfMemcpy forMemcpy;
  407. MsprofHcclProfRDMA RDMA;
  408. MsprofHcclProfFlag flag;
  409. } args;
  410. };
  411. #pragma pack()
  /**
   * @name MsprofStampInfo
   * @brief struct of data reported by msproftx
   *
   * Unlike the other report structs in this header, magicNumber and dataTag
   * have no default initializer here, so the producer must set them.
   */
  struct MsprofStampInfo {
      uint16_t magicNumber;
      uint16_t dataTag;
      uint32_t processId;
      uint32_t threadId;
      uint32_t category; // marker category
      uint32_t eventType;
      int32_t payloadType;
      union PayloadValue // payload info for marker
      {
          uint64_t ullValue;
          int64_t llValue;
          double dValue;
          uint32_t uiValue[2];
          int32_t iValue[2];
          float fValue[2];
      } payload;
      uint64_t startTime;
      uint64_t endTime;
      int32_t messageType;
      char message[128];
      uint8_t reserve0[4];
      uint8_t reserve1[72];
  };
  440. #ifdef __cplusplus
  441. }
  442. #endif
  443. #endif // MSPROFILER_PROF_COMMON_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示