You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

rt_ffts.h 6.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. /*
  2. * Copyright (c) Huawei Technologies Co. , Ltd. 2021. All rights reserved.
  3. * Description: ffts interface
  4. */
  5. #ifndef __CCE_RUNTIME_FFTS_H
  6. #define __CCE_RUNTIME_FFTS_H
  7. #include "base.h"
  8. #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
  9. extern "C" {
  10. #endif
  // Capacity limits used to size the fixed-length arrays in the FFTS task structures below.
  #define RT_FFTS_MAX_SUB_TASK_NUM 32U             // length of rtFftsTaskInfo_t.subTask
  #define RT_FFTS_MAX_TICKET_CACHE_NUM 64U         // length of rtFftsTaskInfo_t.ticketCache
  #define RT_FFTS_MAX_MANUAL_THREAD_NUM 16U        // length of the per-thread tables in manual thread mode
  #define RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK 8U  // length of the per-sub-task ticket cache / dependency tables
  #define RT_FFTS_MANUAL_SRC_DEPEND_TBL_LEN 32U    // length of rtManualThreadDependency_t.dependency
  // FFTS threading mode.
  // The values are set explicitly because they must stay identical to the FFTS definition.
  typedef enum tagFftsType {
      RT_FFTS_TYPE_AUTO_THREAD = 2,   // ffts auto thread mode, same as ffts define
      RT_FFTS_TYPE_MANUAL_THREAD = 3, // ffts manual thread mode, same as ffts define
  } rtFftsType_t;
  // Kind of an FFTS sub task (stored in rtFftsSubTaskInfo_t.subTaskType).
  typedef enum tagFftsSubTaskType {
      RT_FFTS_SUB_TASK_TYPE_AIC = 0,
      RT_FFTS_SUB_TASK_TYPE_AIV = 1,
      RT_FFTS_SUB_TASK_TYPE_NOP = 2,
      RT_FFTS_SUB_TASK_TYPE_NOTIFY_WAIT = 3,
      RT_FFTS_SUB_TASK_TYPE_NOTIFY_RECORD = 4,
      RT_FFTS_SUB_TASK_TYPE_WRITE_VALUE = 5,
      RT_FFTS_SUB_TASK_TYPE_MIX_AIC = 6,
      RT_FFTS_SUB_TASK_TYPE_MIX_AIV = 7,
      RT_FFTS_SUB_TASK_TYPE_SDMA = 8,
      RT_FFTS_SUB_TASK_TYPE_RESERVED, // no explicit value: follows SDMA, so it evaluates to 9
  } rtFftsSubTaskType_t;
  // One DMU descriptor used in manual thread mode (referenced by the prefetchList and
  // dmuList pointers of the manual-thread structures in this header).
  // NOTE(review): the outer/inner count, stride and length fields presumably describe a
  // two-level strided region starting at dataAddr; units are not stated here - confirm.
  typedef struct tagManualThreadDmuInfo {
      uint64_t dataAddr; // device mem
      uint16_t numOuter;
      uint16_t numInner;
      uint32_t strideOuter;
      uint32_t lenInner;
      uint32_t strideInner;
  } rtManualThreadDmuInfo_t;
  40. typedef struct tagManualThreadDependency {
  41. uint8_t dependency[RT_FFTS_MANUAL_SRC_DEPEND_TBL_LEN];
  42. } rtManualThreadDependency_t;
  43. typedef struct tagManualThreadAicAivInfo {
  44. uint64_t taskParamAddr; // device mem
  45. uint16_t taskParamOffset;
  46. // when satMode=1 and FP16 computation with none INF inputs overflows/underflows, results will be +/-INF of FP16
  47. // when satMode=0 and FP16 computation with none INF inputs overflows/underflows
  48. // results will be saturated to +/- MAX of FP16
  49. uint8_t satMode;
  50. uint8_t scheduleMode; // 0:normal mode, 1:batch mode, 2:sync mode, 3: reserved
  51. uint8_t iCachePrefetchCnt; // units is 2K
  52. uint8_t prefetchEnableBitmap; // 8 bit bitmap 1 0 1 0
  53. uint8_t prefetchOnceBitmap; // 8 bit bitmap 1 0 1 0
  54. uint16_t prefetchOnceDmuNum; // prefetch_once_dmu_descriptor_index in ffts
  55. // num: thread0_prefetch_dmu_descriptor_index - prefetch_once_dmu_descriptor_index
  56. uint16_t threadPrefetchDmuIdx[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // max valid is threadDim
  57. uint16_t threadBlkDim[RT_FFTS_MAX_MANUAL_THREAD_NUM];
  58. const char *threadTaskFuncStub[RT_FFTS_MAX_MANUAL_THREAD_NUM];
  59. rtManualThreadDmuInfo_t *prefetchList; // dmu desc 0-64k, length is the last threadPrefetchDmuIdx[threadDim - 1]
  60. rtManualThreadDependency_t srcDepTbl[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  61. } rtManualThreadAicAivInfo_t;
  // One source prefetch entry for auto thread mode
  // (element type of rtAutoThreadAicAivInfo_t.srcPrefetch).
  // NOTE(review): "tail" / "non tail" presumably distinguish the last thread from the
  // others; this header does not say so explicitly - confirm.
  typedef struct tagAutoThreadPrefetch {
      uint64_t dataAddr; // device mem
      uint32_t dataAddrOffset;
      uint32_t nonTailDataLen;
      uint32_t tailDataLen;
  } rtAutoThreadPrefetch_t;
  68. typedef struct tagAutoThreadAicAivInfo {
  69. uint64_t taskParamAddr; // device mem
  70. uint16_t taskParamOffset;
  71. // when satMode=1 and FP16 computation with none INF inputs overflows/underflows, results will be +/-INF of FP16
  72. // when satMode=0 and FP16 computation with none INF inputs overflows/underflows
  73. // results will be saturated to +/- MAX of FP16
  74. uint8_t satMode;
  75. uint8_t scheduleMode; // 0:normal mode, 1:batch mode, 2:sync mode, 3: reserved
  76. uint8_t iCachePrefetchCnt; // units is 2K
  77. uint8_t prefetchEnableBitmap; // 8 bit bitmap
  78. uint8_t prefetchOnceBitmap; // 8 bit bitmap
  79. uint16_t tailBlkDim;
  80. uint16_t nonTailBlkDim;
  81. const char *nonTailTaskFuncStub;
  82. const char *tailTaskFuncStub;
  83. // for prefetch, valid num is prefetchEnableBitmap bit count
  84. // if prefetchEnableBitmap = '00010011', need prefetch number is 3, srcPrefetch is only 0, 1, 2 is valid
  85. rtAutoThreadPrefetch_t srcPrefetch[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  86. } rtAutoThreadAicAivInfo_t;
  // Ticket cache description for auto thread mode
  // (the autoThreadCache member of rtTicketCache_t.custom).
  typedef struct tagAutoThreadCacheInfo {
      uint64_t dataAddr; // device mem
      uint32_t dataAddrOffset;
      uint32_t nonTailDataLen;
      uint32_t tailDataLen;
      uint16_t ticketCacheRefCnt;
  } rtAutoThreadCacheInfo_t;
  94. typedef struct tagManualThreadCacheInfo {
  95. rtManualThreadDmuInfo_t *dmuList; // 0-64k
  96. uint16_t dmuNum;
  97. uint16_t sliceDmuIdx[RT_FFTS_MAX_MANUAL_THREAD_NUM];
  98. uint16_t ticketCacheRefCntTbl[RT_FFTS_MAX_MANUAL_THREAD_NUM];
  99. } rtManualThreadCacheInfo_t;
  // Cache maintenance operation selected for a ticket cache (rtTicketCache_t.cacheOption).
  typedef enum tagCacheOp {
      RT_CACHE_OP_NONE = 0,
      RT_CACHE_OP_FLUSH = 1,
      RT_CACHE_OP_INVALIDATE = 2,
      RT_CACHE_OP_WRITE_BACK = 3,
  } rtCacheOp_t;
  106. typedef struct tagTicketCache {
  107. rtCacheOp_t cacheOption;
  108. uint8_t ticketCacheWindow;
  109. union {
  110. rtAutoThreadCacheInfo_t autoThreadCache;
  111. rtManualThreadCacheInfo_t manualThreadCache;
  112. } custom;
  113. } rtTicketCache_t;
  114. typedef struct tagManualThreadNopInfo {
  115. // depend srcTickCacheVldBitmap in rtFftsSubTaskInfo_t
  116. rtManualThreadDependency_t srcDepTbl[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  117. } rtManualThreadNopInfo_t;
  118. typedef struct tagFftsSubTaskInfo {
  119. rtFftsSubTaskType_t subTaskType;
  120. uint16_t threadDim;
  121. uint8_t dstTickCacheVldBitmap;
  122. uint8_t srcTickCacheVldBitmap;
  123. uint8_t srcDataOutOfSubGraphBitmap;
  124. uint8_t dstTickCacheID[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  125. uint8_t srcTickCacheID[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  126. union {
  127. rtAutoThreadAicAivInfo_t autoThreadAicAiv;
  128. rtManualThreadAicAivInfo_t manualThreadAicAiv;
  129. rtManualThreadNopInfo_t manualThreadNop;
  130. } custom;
  131. } rtFftsSubTaskInfo_t;
  // Task-level FFTS descriptor settings (rtFftsTaskInfo_t.fftsDesc).
  typedef struct tagFftsDescInfo {
      uint8_t tm; // thread subtask kickstart mode, 0:order, 1:disorder
      uint8_t di; // discard invalidate
      uint8_t dw; // discard write back
      uint8_t df; // discard flush
      // split source or ticket cache by 2~dataSplitUnit MB
      // NOTE(review): "2~dataSplitUnit" may mean 2^dataSplitUnit - confirm.
      uint8_t dataSplitUnit;
      uint8_t prefetchOstNum;
      uint8_t cacheMaintainOstNum;
      uint8_t aicPrefetchUpper;
      uint8_t aicPrefetchLower;
      uint8_t aivPrefetchUpper;
      uint8_t aivPrefetchLower;
  } rtFftsDescInfo_t;
  145. typedef struct tagFftsTaskInfo {
  146. rtFftsType_t fftsType;
  147. uint16_t subTaskNum;
  148. uint16_t tickCacheNum;
  149. rtFftsDescInfo_t fftsDesc;
  150. // sub task desc, real num is subTaskNum
  151. rtFftsSubTaskInfo_t subTask[RT_FFTS_MAX_SUB_TASK_NUM];
  152. // ticket cache, real number is ticketCacheNum
  153. rtTicketCache_t ticketCache[RT_FFTS_MAX_TICKET_CACHE_NUM];
  154. } rtFftsTaskInfo_t;
  155. RTS_API rtError_t rtFftsTaskLaunch(rtFftsTaskInfo_t *fftsTaskInfo, rtStream_t stream);
  156. #if defined(__cplusplus) && !defined(COMPILE_OMG_PACKAGE)
  157. }
  158. #endif
  159. #endif //__CCE_RUNTIME_FFTS_H

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用,用户对此并无感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示。