You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

rt_ffts.h 6.7 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. /*
  2. * Copyright (c) Huawei Technologies Co., Ltd. 2021. All rights reserved.
  3. * Description: ffts interface
  4. */
  5. #ifndef CCE_RUNTIME_RT_FFTS_H
  6. #define CCE_RUNTIME_RT_FFTS_H
  7. #include "base.h"
  8. #if defined(__cplusplus)
  9. extern "C" {
  10. #endif
  // Capacity limits that size the fixed-length arrays of the FFTS descriptor structs declared in this header.
  11. #define RT_FFTS_MAX_SUB_TASK_NUM 32U // length of rtFftsTaskInfo_t.subTask[]
  12. #define RT_FFTS_MAX_TICKET_CACHE_NUM 64U // length of rtFftsTaskInfo_t.ticketCache[]
  13. #define RT_FFTS_MAX_MANUAL_THREAD_NUM 16U // length of the per-thread arrays in the manual thread structs
  14. #define RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK 8U // length of the per-sub-task ticket cache ID, prefetch and dependency arrays
  15. #define RT_FFTS_MANUAL_SRC_DEPEND_TBL_LEN 32U // length of rtManualThreadDependency_t.dependency[]
  // Threading mode of an FFTS task; stored in rtFftsTaskInfo_t.fftsType.
  16. typedef enum tagFftsType {
  17. RT_FFTS_TYPE_AUTO_THREAD = 2, // ffts auto thread mode, same as the ffts definition
  18. RT_FFTS_TYPE_MANUAL_THREAD = 3, // ffts manual thread mode, same as the ffts definition
  19. } rtFftsType_t;
  // Type tag of one FFTS sub task; stored in rtFftsSubTaskInfo_t.subTaskType.
  // NOTE(review): the meaning of the individual enumerators is not documented in this header - confirm against the FFTS spec.
  20. typedef enum tagFftsSubTaskType {
  21. RT_FFTS_SUB_TASK_TYPE_AIC = 0,
  22. RT_FFTS_SUB_TASK_TYPE_AIV = 1,
  23. RT_FFTS_SUB_TASK_TYPE_NOP = 2,
  24. RT_FFTS_SUB_TASK_TYPE_NOTIFY_WAIT = 3,
  25. RT_FFTS_SUB_TASK_TYPE_NOTIFY_RECORD = 4,
  26. RT_FFTS_SUB_TASK_TYPE_WRITE_VALUE = 5,
  27. RT_FFTS_SUB_TASK_TYPE_MIX_AIC = 6,
  28. RT_FFTS_SUB_TASK_TYPE_MIX_AIV = 7,
  29. RT_FFTS_SUB_TASK_TYPE_SDMA = 8,
  30. RT_FFTS_SUB_TASK_TYPE_RESERVED = 9,
  31. } rtFftsSubTaskType_t;
  // One DMU descriptor for manual thread mode; lists of these are pointed to by
  // rtManualThreadAicAivInfo_t.prefetchList and rtManualThreadCacheInfo_t.dmuList.
  // NOTE(review): the units of the count/stride/length fields are not stated in this header - confirm.
  32. typedef struct tagManualThreadDmuInfo {
  33. uint64_t dataAddr; // device memory address
  34. uint16_t numOuter; // presumably the outer-level repeat count - TODO confirm
  35. uint16_t numInner; // presumably the inner-level repeat count - TODO confirm
  36. uint32_t strideOuter; // presumably the step between outer repeats - TODO confirm
  37. uint32_t lenInner; // presumably the length of one inner segment - TODO confirm
  38. uint32_t strideInner; // presumably the step between inner segments - TODO confirm
  39. } rtManualThreadDmuInfo_t;
  // One source dependency table; element type of the srcDepTbl[] arrays in
  // rtManualThreadAicAivInfo_t and rtManualThreadNopInfo_t.
  40. typedef struct tagManualThreadDependency {
  41. uint8_t dependency[RT_FFTS_MANUAL_SRC_DEPEND_TBL_LEN]; // NOTE(review): entry encoding is not documented here - confirm
  42. } rtManualThreadDependency_t;
  // Manual thread mode parameters of a sub task; one member of the rtFftsSubTaskInfo_t.custom union.
  43. typedef struct tagManualThreadAicAivInfo {
  44. uint64_t taskParamAddr; // device memory address
  45. uint16_t taskParamOffset;
  46. // when satMode=1 and an FP16 computation with non-INF inputs overflows/underflows, results will be +/-INF of FP16
  47. // when satMode=0 and an FP16 computation with non-INF inputs overflows/underflows,
  48. // results will be saturated to +/-MAX of FP16
  49. uint8_t satMode;
  50. uint8_t scheduleMode; // 0: normal mode, 1: batch mode, 2: sync mode, 3: reserved
  51. uint8_t iCachePrefetchCnt; // unit is 2K
  52. uint8_t prefetchEnableBitmap; // 8-bit bitmap 1 0 1 0
  53. uint8_t prefetchOnceBitmap; // 8-bit bitmap 1 0 1 0
  54. uint16_t prefetchOnceDmuNum; // prefetch_once_dmu_descriptor_index in ffts
  55. // num: thread0_prefetch_dmu_descriptor_index - prefetch_once_dmu_descriptor_index
  56. uint16_t threadPrefetchDmuIdx[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // at most threadDim entries are valid
  57. uint16_t threadBlkDim[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // one slot per manual thread
  58. const char *threadTaskFuncStub[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // one stub pointer slot per manual thread
  59. rtManualThreadDmuInfo_t *prefetchList; // dmu desc 0-64k, length is the last valid index entry, threadPrefetchDmuIdx[threadDim-1]
  60. rtManualThreadDependency_t srcDepTbl[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK]; // NOTE(review): presumably one table per source ticket cache slot - confirm
  61. } rtManualThreadAicAivInfo_t;
  // One prefetch source for auto thread mode; element type of rtAutoThreadAicAivInfo_t.srcPrefetch[].
  62. typedef struct tagAutoThreadPrefetch {
  63. uint64_t dataAddr; // device memory address
  64. uint32_t dataAddrOffset;
  65. uint32_t nonTailDataLen; // NOTE(review): presumably the data length used by non-tail threads - confirm
  66. uint32_t tailDataLen; // NOTE(review): presumably the data length used by the tail thread - confirm
  67. } rtAutoThreadPrefetch_t;
  // Auto thread mode parameters of a sub task; one member of the rtFftsSubTaskInfo_t.custom union.
  68. typedef struct tagAutoThreadAicAivInfo {
  69. uint64_t taskParamAddr; // device memory address
  70. uint16_t taskParamOffset;
  71. // when satMode=1 and an FP16 computation with non-INF inputs overflows/underflows, results will be +/-INF of FP16
  72. // when satMode=0 and an FP16 computation with non-INF inputs overflows/underflows, results will be saturated to +/-MAX of FP16
  73. uint8_t satMode;
  74. uint8_t scheduleMode; // 0: normal mode, 1: batch mode, 2: sync mode, 3: reserved
  75. uint8_t iCachePrefetchCnt; // unit is 2K
  76. uint8_t prefetchEnableBitmap; // 8-bit bitmap
  77. uint8_t prefetchOnceBitmap; // 8-bit bitmap
  78. uint16_t tailBlkDim;
  79. uint16_t nonTailBlkDim;
  80. const char *nonTailTaskFuncStub;
  81. const char *tailTaskFuncStub;
  82. // for prefetch, the number of valid entries is the bit count of prefetchEnableBitmap.
  83. // e.g. if prefetchEnableBitmap='00010011', 3 prefetches are needed and only srcPrefetch[0], [1] and [2] are valid
  84. rtAutoThreadPrefetch_t srcPrefetch[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  85. } rtAutoThreadAicAivInfo_t;
  // Auto thread mode view of a ticket cache; one member of the rtTicketCache_t.custom union.
  86. typedef struct tagAutoThreadCacheInfo {
  87. uint64_t dataAddr; // device memory address
  88. uint32_t dataAddrOffset;
  89. uint32_t nonTailDataLen; // NOTE(review): presumably the data length used by non-tail threads - confirm
  90. uint32_t tailDataLen; // NOTE(review): presumably the data length used by the tail thread - confirm
  91. uint16_t ticketCacheRefCnt; // NOTE(review): presumably a reference count for this ticket cache - confirm
  92. } rtAutoThreadCacheInfo_t;
  // Manual thread mode view of a ticket cache; one member of the rtTicketCache_t.custom union.
  93. typedef struct tagManualThreadCacheInfo {
  94. rtManualThreadDmuInfo_t *dmuList; // 0-64k
  95. uint16_t dmuNum; // NOTE(review): presumably the number of descriptors reachable through dmuList - confirm
  96. uint16_t sliceDmuIdx[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // one slot per manual thread
  97. uint16_t ticketCacheRefCntTbl[RT_FFTS_MAX_MANUAL_THREAD_NUM]; // one slot per manual thread
  98. } rtManualThreadCacheInfo_t;
  // Cache operation selector; stored in rtTicketCache_t.cacheOption.
  99. typedef enum tagCacheOp {
  100. RT_CACHE_OP_NONE = 0,
  101. RT_CACHE_OP_FLUSH = 1,
  102. RT_CACHE_OP_INVALIDATE = 2,
  103. RT_CACHE_OP_WRITE_BACK = 3,
  104. } rtCacheOp_t;
  // Description of one ticket cache; element type of rtFftsTaskInfo_t.ticketCache[].
  105. typedef struct tagTicketCache {
  106. rtCacheOp_t cacheOption; // one of the RT_CACHE_OP_* values
  107. uint8_t ticketCacheWindow;
  108. union { // NOTE(review): presumably the active member follows rtFftsTaskInfo_t.fftsType (auto vs. manual) - confirm
  109. rtAutoThreadCacheInfo_t autoThreadCache;
  110. rtManualThreadCacheInfo_t manualThreadCache;
  111. } custom;
  112. } rtTicketCache_t;
  // Manual thread mode parameters of a NOP sub task; one member of the rtFftsSubTaskInfo_t.custom union.
  113. typedef struct tagManualThreadNopInfo {
  114. // depends on srcTickCacheVldBitmap in rtFftsSubTaskInfo_t
  115. rtManualThreadDependency_t srcDepTbl[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  116. } rtManualThreadNopInfo_t;
  // Description of one sub task; element type of rtFftsTaskInfo_t.subTask[].
  117. typedef struct tagFftsSubTaskInfo {
  118. rtFftsSubTaskType_t subTaskType; // one of the RT_FFTS_SUB_TASK_TYPE_* values
  119. uint16_t threadDim; // NOTE(review): presumably the threadDim that the rtManualThreadAicAivInfo_t comments refer to - confirm
  120. uint8_t dstTickCacheVldBitmap; // NOTE(review): presumably bit i marks dstTickCacheID[i] as valid - confirm
  121. uint8_t srcTickCacheVldBitmap; // NOTE(review): presumably bit i marks srcTickCacheID[i] as valid - confirm
  122. uint8_t srcDataOutOfSubGraphBitmap;
  123. uint8_t dstTickCacheID[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  124. uint8_t srcTickCacheID[RT_FFTS_MAX_TICKET_CACHE_PER_SUBTASK];
  125. union { // NOTE(review): presumably the active member is chosen by fftsType and subTaskType - confirm
  126. rtAutoThreadAicAivInfo_t autoThreadAicAiv;
  127. rtManualThreadAicAivInfo_t manualThreadAicAiv;
  128. rtManualThreadNopInfo_t manualThreadNop;
  129. } custom;
  130. } rtFftsSubTaskInfo_t;
  // Task-wide FFTS descriptor settings; embedded as rtFftsTaskInfo_t.fftsDesc.
  131. typedef struct tagFftsDescInfo {
  132. uint8_t tm; // thread subtask kickstart mode, 0: order, 1: disorder
  133. uint8_t di; // discard invalidate
  134. uint8_t dw; // discard write back
  135. uint8_t df; // discard flush
  136. uint8_t dataSplitUnit; // split source or ticket cache by 2^dataSplitUnit MB
  137. uint8_t prefetchOstNum; // NOTE(review): "Ost" is not expanded anywhere in this header - confirm meaning
  138. uint8_t cacheMaintainOstNum;
  139. uint8_t aicPrefetchUpper;
  140. uint8_t aicPrefetchLower;
  141. uint8_t aivPrefetchUpper;
  142. uint8_t aivPrefetchLower;
  143. } rtFftsDescInfo_t;
  // Complete description of one FFTS task; the argument type of rtFftsTaskLaunch and rtFftsTaskLaunchWithFlag.
  144. typedef struct tagFftsTaskInfo {
  145. rtFftsType_t fftsType; // RT_FFTS_TYPE_AUTO_THREAD or RT_FFTS_TYPE_MANUAL_THREAD
  146. uint16_t subTaskNum; // number of entries of subTask[] actually in use
  147. uint16_t tickCacheNum; // number of entries of ticketCache[] actually in use
  148. rtFftsDescInfo_t fftsDesc;
  149. // sub task descriptors; only the first subTaskNum entries are used
  150. rtFftsSubTaskInfo_t subTask[RT_FFTS_MAX_SUB_TASK_NUM];
  151. // ticket caches; only the first tickCacheNum entries are used
  152. rtTicketCache_t ticketCache[RT_FFTS_MAX_TICKET_CACHE_NUM];
  153. } rtFftsTaskInfo_t;
  // Declaration only: takes a pointer to an FFTS task description and a stream handle, and returns an rtError_t.
  // NOTE(review): the implementation is not in this header; presumably it submits the task to the stream. Whether
  // fftsTaskInfo may be NULL or is modified, and which error codes are returned, cannot be told from here - confirm.
  154. RTS_API rtError_t rtFftsTaskLaunch(rtFftsTaskInfo_t *fftsTaskInfo, rtStream_t stream);
  // Declaration only: same parameters as rtFftsTaskLaunch plus a 32-bit flag; returns an rtError_t.
  // NOTE(review): the accepted flag values and their effect are not documented in this header - confirm.
  155. RTS_API rtError_t rtFftsTaskLaunchWithFlag(rtFftsTaskInfo_t *fftsTaskInfo, rtStream_t stream, uint32_t flag);
  156. #if defined(__cplusplus)
  157. }
  158. #endif
  159. #endif // CCE_RUNTIME_RT_FFTS_H

图引擎模块(GE)是MindSpore的一个子模块，其代码由C++实现，位于前端模块ME和底层硬件之间，起到承接作用。图引擎模块以ME下发的图作为输入，然后进行一系列的深度图优化操作，最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点，做了特定的优化工作，以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时，GE会被自动调用，用户对此并无感知。GE主要由GE API和GE Core两部分组成，详细的架构图如下所示。