You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

rt_ffts.h 6.8 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  /*
   * Copyright (c) Huawei Technologies Co., Ltd. 2021-2021. All rights reserved.
   * Description: ffts interface
   */
  5. #ifndef CCE_RUNTIME_RT_FFTS_H
  6. #define CCE_RUNTIME_RT_FFTS_H
  7. #include "base.h"
  8. #if defined(__cplusplus)
  9. extern "C" {
  10. #endif
  // Capacity limits that size the fixed-length arrays of the FFTS descriptors declared in this header.
  #define RT_FFTS_MAX_SUB_TASK_NUM 32U             // length of rtFftsTaskInfo_t.subTask
  #define RT_FFTS_MAX_TICKET_CACHE_NUM 64U         // length of rtFftsTaskInfo_t.ticketCache
  #define RT_FFTS_MAX_MANUAL_THREAD_NUM 16U        // length of the per-thread tables in the manual-thread structs
  #define RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK 8U  // length of the per-sub-task ticket cache, prefetch and dependency tables
  #define RT_FFTS_MANUAL_SRC_DEPEND_TBL_LEN 32U    // length of rtManualThreadDependency_t.dependency
  // FFTS threading mode; stored in rtFftsTaskInfo_t.fftsType.
  // The numeric values are the same as in the FFTS definition, so they must not be renumbered.
  typedef enum tagFftsType {
      RT_FFTS_TYPE_AUTO_THREAD = 2,    // ffts auto thread mode, same as ffts define
      RT_FFTS_TYPE_MANUAL_THREAD = 3,  // ffts manual thread mode, same as ffts define
  } rtFftsType_t;
  // Kind of a single FFTS sub-task; stored in rtFftsSubTaskInfo_t.subTaskType.
  // Values are explicit and contiguous from 0 to 9.
  // NOTE(review): presumably these mirror a hardware/firmware encoding like rtFftsType_t does - confirm before renumbering.
  typedef enum tagFftsSubTaskType {
      RT_FFTS_SUB_TASK_TYPE_AIC = 0,
      RT_FFTS_SUB_TASK_TYPE_AIV = 1,
      RT_FFTS_SUB_TASK_TYPE_NOP = 2,
      RT_FFTS_SUB_TASK_TYPE_NOTIFY_WAIT = 3,
      RT_FFTS_SUB_TASK_TYPE_NOTIFY_RECORD = 4,
      RT_FFTS_SUB_TASK_TYPE_WRITE_VALUE = 5,
      RT_FFTS_SUB_TASK_TYPE_MIX_AIC = 6,
      RT_FFTS_SUB_TASK_TYPE_MIX_AIV = 7,
      RT_FFTS_SUB_TASK_TYPE_SDMA = 8,
      RT_FFTS_SUB_TASK_TYPE_RESERVED = 9,
  } rtFftsSubTaskType_t;
  // One DMU descriptor entry for manual-thread mode; element type of
  // rtManualThreadAicAivInfo_t.prefetchList and rtManualThreadCacheInfo_t.dmuList.
  // NOTE(review): the outer/inner count, stride and length fields look like a two-level strided
  // region description; units are not stated in this header - confirm.
  typedef struct tagManualThreadDmuInfo {
      uint64_t dataAddr;  // device mem
      uint16_t numOuter;
      uint16_t numInner;
      uint32_t strideOuter;
      uint32_t lenInner;
      uint32_t strideInner;
  } rtManualThreadDmuInfo_t;
  40. typedef struct tagManualThreadDependency {
  41. uint8_t dependency[RT_FFTS_MANUAL_SRC_DEPEND_TBL_LEN];
  42. } rtManualThreadDependency_t;
  43. typedef struct tagManualThreadAicAivInfo {
  44. uint64_t taskParamAddr; // device mem
  45. uint16_t taskParamOffset;
  46. // when satMode=1 and FP16 computation with none INF inputs overflows/underflows, results will be +/-INF of FP16
  47. // when satMode=0 and FP16 computation with none INF inputs overflows/underflows,
  48. // results will be saturated to +/-MAX of FP16
  49. uint8_t satMode;
  50. uint8_t scheduleMode; // 0:normal mode, 1:batch mode, 2:sync mode 3:reserved
  51. uint8_t iCachePrefetchCnt; // units is 2K
  52. uint8_t prefetchEnableBitmap; // 8 bit bitmap 1 0 1 0
  53. uint8_t prefetchOnceBitmap; // 8 bit bitmap 1 0 1 0
  54. uint16_t prefetchOnceDmuNum; // prefetch_once_dmu_descriptor_index in ffts
  55. // num: thread0_prefetch_dmu_descriptor_index – prefetch_once_dmu_descriptor_index
  56. uint16_t threadPrefetchDmuIdx[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // max valid is threadDim
  57. uint16_t threadBlkDim[RT_FFTS_MAX_MANUAL_THREAD_NUM];
  58. const char_t *threadTaskFuncStub[RT_FFTS_MAX_MANUAL_THREAD_NUM];
  59. rtManualThreadDmuInfo_t *prefetchList; // dmu desc 0-64k, length is the last threadPrefetchDmuIdx[threadDim-1]
  60. rtManualThreadDependency_t srcDepTbl[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  61. } rtManualThreadAicAivInfo_t;
  // One prefetch source entry for auto-thread mode; element type of rtAutoThreadAicAivInfo_t.srcPrefetch.
  // NOTE(review): the field names suggest separate data lengths for non-tail and tail threads;
  // units are not stated in this header - confirm.
  typedef struct tagAutoThreadPrefetch {
      uint64_t dataAddr;  // device mem
      uint32_t dataAddrOffset;
      uint32_t nonTailDataLen;
      uint32_t tailDataLen;
  } rtAutoThreadPrefetch_t;
  68. typedef struct tagAutoThreadAicAivInfo {
  69. uint64_t taskParamAddr; // device mem
  70. uint16_t taskParamOffset;
  71. /*
  72. * when satMode=1 and FP16 computation with none INF inputs overflows/underflows, results will be +/-INF of FP16
  73. * when satMode=0 and FP16 computation with none INF inputs overflows/underflows, results will be saturated to
  74. * +/-MAX of FP16
  75. */
  76. uint8_t satMode;
  77. uint8_t scheduleMode; // 0:normal mode, 1:batch mode, 2:sync mode 3:reserved
  78. uint8_t iCachePrefetchCnt; // units is 2K
  79. uint8_t prefetchEnableBitmap; // 8 bit bitmap
  80. uint8_t prefetchOnceBitmap; // 8 bit bitmap
  81. uint16_t tailBlkDim;
  82. uint16_t nonTailBlkDim;
  83. const char_t *nonTailTaskFuncStub;
  84. const char_t *tailTaskFuncStub;
  85. // for prefetch, valid num is prefetchEnableBitmap bit count.
  86. // if prefetchEnableBitmap='00010011', need prefetch number is 3, srcPrefetch is only 0, 1, 2 is valid
  87. rtAutoThreadPrefetch_t srcPrefetch[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  88. } rtAutoThreadAicAivInfo_t;
  // Auto-thread ticket cache description; stored in rtTicketCache_t.custom.autoThreadCache.
  // Carries the same address/offset/length fields as rtAutoThreadPrefetch_t plus a reference count.
  typedef struct tagAutoThreadCacheInfo {
      uint64_t dataAddr;  // device mem
      uint32_t dataAddrOffset;
      uint32_t nonTailDataLen;
      uint32_t tailDataLen;
      uint16_t ticketCacheRefCnt;
  } rtAutoThreadCacheInfo_t;
  96. typedef struct tagManualThreadCacheInfo {
  97. rtManualThreadDmuInfo_t *dmuList; // 0-64k
  98. uint16_t dmuNum;
  99. uint16_t sliceDmuIdx[RT_FFTS_MAX_MANUAL_THREAD_NUM];
  100. uint16_t ticketCacheRefCntTbl[RT_FFTS_MAX_MANUAL_THREAD_NUM];
  101. } rtManualThreadCacheInfo_t;
  // Cache maintenance operation selector; stored in rtTicketCache_t.cacheOption.
  typedef enum tagCacheOp {
      RT_CACHE_OP_NONE = 0,
      RT_CACHE_OP_FLUSH = 1,
      RT_CACHE_OP_INVALIDATE = 2,
      RT_CACHE_OP_WRITE_BACK = 3,
  } rtCacheOp_t;
  108. typedef struct tagTicketCache {
  109. rtCacheOp_t cacheOption;
  110. uint8_t ticketCacheWindow;
  111. union {
  112. rtAutoThreadCacheInfo_t autoThreadCache;
  113. rtManualThreadCacheInfo_t manualThreadCache;
  114. } custom;
  115. } rtTicketCache_t;
  116. typedef struct tagManualThreadNopInfo {
  117. // depend srcTickCacheVldBitmap in rtFftsSubTaskInfo_t
  118. rtManualThreadDependency_t srcDepTbl[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  119. } rtManualThreadNopInfo_t;
  120. typedef struct tagFftsSubTaskInfo {
  121. rtFftsSubTaskType_t subTaskType;
  122. uint16_t threadDim;
  123. uint8_t dstTickCacheVldBitmap;
  124. uint8_t srcTickCacheVldBitmap;
  125. uint8_t srcDataOutOfSubGraphBitmap;
  126. uint8_t dstTickCacheID[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  127. uint8_t srcTickCacheID[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  128. union {
  129. rtAutoThreadAicAivInfo_t autoThreadAicAiv;
  130. rtManualThreadAicAivInfo_t manualThreadAicAiv;
  131. rtManualThreadNopInfo_t manualThreadNop;
  132. } custom;
  133. } rtFftsSubTaskInfo_t;
  // Task-wide FFTS descriptor settings; stored in rtFftsTaskInfo_t.fftsDesc.
  // Every member is a single byte.
  typedef struct tagFftsDescInfo {
      uint8_t tm;             // thread subtask kickstart mode, 0:order, 1:disorder
      uint8_t di;             // discard invalidate
      uint8_t dw;             // discard write back
      uint8_t df;             // discard flush
      uint8_t dataSplitUnit;  // split source or ticket cache by 2^dataSplitUnit MB
      uint8_t prefetchOstNum;
      uint8_t cacheMaintainOstNum;
      uint8_t aicPrefetchUpper;
      uint8_t aicPrefetchLower;
      uint8_t aivPrefetchUpper;
      uint8_t aivPrefetchLower;
  } rtFftsDescInfo_t;
  147. typedef struct tagFftsTaskInfo {
  148. rtFftsType_t fftsType;
  149. uint16_t subTaskNum;
  150. uint16_t tickCacheNum;
  151. rtFftsDescInfo_t fftsDesc;
  152. // sub task desc, real num is subTaskNum
  153. rtFftsSubTaskInfo_t subTask[RT_FFTS_MAX_SUB_TASK_NUM];
  154. // ticket cache, real number is tickCacheNum.
  155. rtTicketCache_t ticketCache[RT_FFTS_MAX_TICKET_CACHE_NUM];
  156. } rtFftsTaskInfo_t;
  157. RTS_API rtError_t rtFftsTaskLaunch(rtFftsTaskInfo_t *fftsTaskInfo, rtStream_t stm);
  158. RTS_API rtError_t rtGetC2cCtrlAddr(uint64_t *addr, uint32_t *len);
  159. RTS_API rtError_t rtFftsTaskLaunchWithFlag(rtFftsTaskInfo_t *fftsTaskInfo, rtStream_t stm, uint32_t flag);
  160. #if defined(__cplusplus)
  161. }
  162. #endif
  163. #endif // CCE_RUNTIME_RT_FFTS_H

图引擎模块（GE）是MindSpore的一个子模块，其代码由C++实现，位于前端模块ME和底层硬件之间，起到承接作用。图引擎模块以ME下发的图作为输入，然后进行一系列的深度图优化操作，最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点，做了特定的优化工作，以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时，GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成，详细的架构图如下所示。