You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

common.h 98 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183
/**
 * \file dnn/src/aarch64/matrix_mul/asm/common.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
  12. #pragma once
  13. #include <cmath>
  14. #include <cstdint>
  15. #include <type_traits>
  16. #include "src/arm_common/simd_macro/marm_neon.h"
  17. #include "src/common/utils.h"
  18. #include "src/fallback/conv_bias/common.h"
  19. namespace megdnn {
  20. namespace aarch64 {
/* ======================== Prefetch ======================== */
//! Each macro expands to one AArch64 PRFM prefetch instruction inside an asm
//! string. `address` must be a bracketed addressing expression such as
//! "[%[ptr], #64]". PLD* hints prefetch-for-load, PST* ("W" variants) hint
//! prefetch-for-store; L1/L2 select the target cache level and KEEP asks the
//! cache to retain the line.
#define ASM_PREFETCH(address) "PRFM PLDL1KEEP, " address "\n"
#define ASM_PREFETCHL2(address) "PRFM PLDL2KEEP, " address "\n"
#define ASM_PREFETCHW(address) "PRFM PSTL1KEEP, " address "\n"
#define ASM_PREFETCHWL2(address) "PRFM PSTL2KEEP, " address "\n"
  26. static inline void prefetch_6x(const void* pfp) {
  27. // clang-format off
  28. asm volatile(ASM_PREFETCH("[%[pfp]]")
  29. ASM_PREFETCH("[%[pfp], #64]")
  30. ASM_PREFETCH("[%[pfp], #128]")
  31. ASM_PREFETCH("[%[pfp], #192]")
  32. ASM_PREFETCH("[%[pfp], #256]")
  33. ASM_PREFETCH("[%[pfp], #320]")
  34. :
  35. : [pfp] "r"(pfp)
  36. : "memory");
  37. // clang-format on
  38. }
  39. static inline void prefetch_5x(const void* pfp) {
  40. // clang-format off
  41. asm volatile(ASM_PREFETCH("[%[pfp]]")
  42. ASM_PREFETCH("[%[pfp], #64]")
  43. ASM_PREFETCH("[%[pfp], #128]")
  44. ASM_PREFETCH("[%[pfp], #192]")
  45. ASM_PREFETCH("[%[pfp], #256]")
  46. :
  47. : [pfp] "r"(pfp)
  48. : "memory");
  49. // clang-format on
  50. }
  51. static inline void prefetch_4x(const void* pfp) {
  52. // clang-format off
  53. asm volatile(ASM_PREFETCH("[%[pfp]]")
  54. ASM_PREFETCH("[%[pfp], #64]")
  55. ASM_PREFETCH("[%[pfp], #128]")
  56. ASM_PREFETCH("[%[pfp], #192]")
  57. :
  58. : [pfp] "r"(pfp)
  59. : "memory");
  60. // clang-format on
  61. }
  62. static inline void prefetch_3x(const void* pfp) {
  63. // clang-format off
  64. asm volatile(ASM_PREFETCH("[%[pfp]]")
  65. ASM_PREFETCH("[%[pfp], #64]")
  66. ASM_PREFETCH("[%[pfp], #128]")
  67. :
  68. : [pfp] "r"(pfp)
  69. : "memory");
  70. // clang-format on
  71. }
  72. static inline void prefetch_2x(const void* pfp) {
  73. // clang-format off
  74. asm volatile(ASM_PREFETCH("[%[pfp]]")
  75. ASM_PREFETCH("[%[pfp], #64]")
  76. :
  77. : [pfp] "r"(pfp)
  78. : "memory");
  79. // clang-format on
  80. }
  81. static inline void prefetch_1x(const void* pfp) {
  82. // clang-format off
  83. asm volatile(ASM_PREFETCH("[%[pfp]]") : : [pfp] "r"(pfp) : "memory");
  84. // clang-format on
  85. }
/* ======================== interleave pack A ======================== */
/**
 * interleave_INTERLEAVE_UNROLLK_BATCH_type
 *
 * BATCH means process BATCH * UNROLL_K cols once, BATCH * sizeof(TYPE) *
 * UNROLL_K = 16 bytes (128 bits, a vector size).
 *
 * the elements traverse order:
 * rep(j, 0, INTERLEAVE) rep(i, 0, UNROLL_K) *outptr++ = inptr[j, i]
 */
template <typename T>
static inline void interleave_24x1_8_h_helper(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int skippf = 0) {
    //! Zip 8 halfwords from each of 8 row pointers (rows A..H) into column
    //! order: the 8 result vectors are A0..H0, A1..H1, ..., A7..H7. Each
    //! 16-byte store post-increments outptr by 48 bytes (24 halfwords), so
    //! consecutive column vectors land 24 rows apart -- this fills an 8-row
    //! slice of a 24-row interleaved pack; presumably further helper passes
    //! fill the remaining 16 rows (TODO confirm against callers). Every input
    //! pointer advances by 16 bytes. When skippf != 0 the software prefetches
    //! are branched over.
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            // Load up 8 elements (1 vector) from each of 8 sources.
            "cbnz %w[skippf], 1f\n"
            ASM_PREFETCH("[%[inptr0], #128]")
            ASM_PREFETCH("[%[inptr1], #128]")
            ASM_PREFETCH("[%[inptr2], #128]")
            ASM_PREFETCH("[%[inptr3], #128]")
            "1:\n"
            "ldr q0, [%[inptr0]], #16\n"     // q0=A0A1A2A3A4A5A6A7
            "ldr q4, [%[inptr4]], #16\n"     // q4=E0E1E2E3E4E5E6E7
            "ldr q2, [%[inptr2]], #16\n"     // q2=C0C1C2C3...
            "ldr q6, [%[inptr6]], #16\n"     // q6=G0G1G2G3...
            "zip1 v8.8h, v0.8h, v4.8h\n"     // q8=A0E0A1E1A2E2A3E3
            "zip2 v16.8h, v0.8h, v4.8h\n"    // q16=A4E4A5E5A6E6A7E7
            "zip1 v9.8h, v2.8h, v6.8h\n"     // q9=C0G0C1G1C2G2C3G3
            "zip2 v17.8h, v2.8h, v6.8h\n"    // q17=C4G4C5G5C6G6C7G7
            "ldr q1, [%[inptr1]], #16\n"     // q1=B0B1B2B3B4B5B6B7
            "ldr q5, [%[inptr5]], #16\n"     // q5=F0F1F2F3...
            "ldr q3, [%[inptr3]], #16\n"     // q3=D0D1D2D3...
            "ldr q7, [%[inptr7]], #16\n"     // q7=H0H1H2H3...
            "zip1 v10.8h, v1.8h, v5.8h\n"    // q10=B0F0B1F1B2F2B3F3
            "zip2 v18.8h, v1.8h, v5.8h\n"    // q18=B4F4B5F5B6F6B7F7
            "zip1 v11.8h, v3.8h, v7.8h\n"    // q11=D0H0D1H1D2H2D3H3
            "zip2 v19.8h, v3.8h, v7.8h\n"    // q19=D4H4D5H5D6H6D7H7
            "zip1 v12.8h, v8.8h, v9.8h\n"    // q12=A0C0E0G0A1C1E1G1
            "zip2 v20.8h, v8.8h, v9.8h\n"    // q20=A2C2E2G2A3C3E3G3
            "zip1 v13.8h, v10.8h, v11.8h\n"  // q13=B0D0F0H0B1D1F1H1
            "zip2 v21.8h, v10.8h, v11.8h\n"  // q21=B2D2F2H2B3D3F3H3
            "cbnz %w[skippf], 2f\n"
            ASM_PREFETCH("[%[inptr4], #112]")
            ASM_PREFETCH("[%[inptr5], #112]")
            ASM_PREFETCH("[%[inptr6], #112]")
            ASM_PREFETCH("[%[inptr7], #112]")
            "2:\n"
            "zip1 v22.8h, v16.8h, v17.8h\n"  // q22=A4C4E4G4A5C5E5G5
            "zip2 v30.8h, v16.8h, v17.8h\n"  // q30=A6C6E6G6A7C7E7G7
            "zip1 v23.8h, v18.8h, v19.8h\n"  // q23=B4D4F4H4B5D5F5H5
            "zip2 v31.8h, v18.8h, v19.8h\n"  // q31=B6D6F6H6B7D7F7H7
            "zip1 v14.8h, v12.8h, v13.8h\n"  // q14=A0B0C0D0E0F0G0H0
            "zip2 v15.8h, v12.8h, v13.8h\n"  // q15=A1B1C1D1E1F1G1H1
            // Stores advance by 48 bytes: column vectors are 24 rows apart.
            "str q14, [%[outptr]], #48\n"
            "str q15, [%[outptr]], #48\n"
            "zip1 v0.8h, v20.8h, v21.8h\n"   // q0=A2B2C2D2E2F2G2H2
            "zip2 v1.8h, v20.8h, v21.8h\n"   // q1=A3B3C3D3E3F3G3H3
            "str q0, [%[outptr]], #48\n"
            "str q1, [%[outptr]], #48\n"
            "zip1 v2.8h, v22.8h, v23.8h\n"   // q2=A4B4C4D4E4F4G4H4
            "zip2 v3.8h, v22.8h, v23.8h\n"   // q3=A5B5C5D5E5F5G5H5
            "str q2, [%[outptr]], #48\n"
            "str q3, [%[outptr]], #48\n"
            "zip1 v4.8h, v30.8h, v31.8h\n"   // q4=A6B6C6D6E6F6G6H6
            "zip2 v5.8h, v30.8h, v31.8h\n"   // q5=A7B7C7D7E7F7G7H7
            "str q4, [%[outptr]], #48\n"
            "str q5, [%[outptr]], #48\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1),
              [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
              [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [outptr] "+r"(outptr)
            : [skippf] "r"(skippf)
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
              "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
              "v31", "cc", "memory");
}
template <typename T>
static inline void interleave_16x1_8_h_helper(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int skippf = 0) {
    //! Same transpose as interleave_24x1_8_h_helper, but each 16-byte store
    //! post-increments outptr by 32 bytes (16 halfwords): consecutive column
    //! vectors land 16 rows apart, i.e. this fills an 8-row slice of a 16-row
    //! interleaved pack (the other 8 rows presumably come from a second pass;
    //! TODO confirm against callers). Every input pointer advances 16 bytes.
    //! When skippf != 0 the software prefetches are branched over.
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            // Load up 8 elements (1 vector) from each of 8 sources.
            "cbnz %w[skippf], 1f\n"
            ASM_PREFETCH("[%[inptr0], #128]")
            ASM_PREFETCH("[%[inptr1], #128]")
            ASM_PREFETCH("[%[inptr2], #128]")
            ASM_PREFETCH("[%[inptr3], #128]")
            "1:\n"
            "ldr q0, [%[inptr0]], #16\n"     // q0=A0A1A2A3A4A5A6A7
            "ldr q4, [%[inptr4]], #16\n"     // q4=E0E1E2E3E4E5E6E7
            "ldr q2, [%[inptr2]], #16\n"     // q2=C0C1C2C3...
            "ldr q6, [%[inptr6]], #16\n"     // q6=G0G1G2G3...
            "zip1 v8.8h, v0.8h, v4.8h\n"     // q8=A0E0A1E1A2E2A3E3
            "zip2 v16.8h, v0.8h, v4.8h\n"    // q16=A4E4A5E5A6E6A7E7
            "zip1 v9.8h, v2.8h, v6.8h\n"     // q9=C0G0C1G1C2G2C3G3
            "zip2 v17.8h, v2.8h, v6.8h\n"    // q17=C4G4C5G5C6G6C7G7
            "ldr q1, [%[inptr1]], #16\n"     // q1=B0B1B2B3B4B5B6B7
            "ldr q5, [%[inptr5]], #16\n"     // q5=F0F1F2F3...
            "ldr q3, [%[inptr3]], #16\n"     // q3=D0D1D2D3...
            "ldr q7, [%[inptr7]], #16\n"     // q7=H0H1H2H3...
            "zip1 v10.8h, v1.8h, v5.8h\n"    // q10=B0F0B1F1B2F2B3F3
            "zip2 v18.8h, v1.8h, v5.8h\n"    // q18=B4F4B5F5B6F6B7F7
            "zip1 v11.8h, v3.8h, v7.8h\n"    // q11=D0H0D1H1D2H2D3H3
            "zip2 v19.8h, v3.8h, v7.8h\n"    // q19=D4H4D5H5D6H6D7H7
            "zip1 v12.8h, v8.8h, v9.8h\n"    // q12=A0C0E0G0A1C1E1G1
            "zip2 v20.8h, v8.8h, v9.8h\n"    // q20=A2C2E2G2A3C3E3G3
            "zip1 v13.8h, v10.8h, v11.8h\n"  // q13=B0D0F0H0B1D1F1H1
            "zip2 v21.8h, v10.8h, v11.8h\n"  // q21=B2D2F2H2B3D3F3H3
            "cbnz %w[skippf], 2f\n"
            ASM_PREFETCH("[%[inptr4], #112]")
            ASM_PREFETCH("[%[inptr5], #112]")
            ASM_PREFETCH("[%[inptr6], #112]")
            ASM_PREFETCH("[%[inptr7], #112]")
            "2:\n"
            "zip1 v22.8h, v16.8h, v17.8h\n"  // q22=A4C4E4G4A5C5E5G5
            "zip2 v30.8h, v16.8h, v17.8h\n"  // q30=A6C6E6G6A7C7E7G7
            "zip1 v23.8h, v18.8h, v19.8h\n"  // q23=B4D4F4H4B5D5F5H5
            "zip2 v31.8h, v18.8h, v19.8h\n"  // q31=B6D6F6H6B7D7F7H7
            "zip1 v14.8h, v12.8h, v13.8h\n"  // q14=A0B0C0D0E0F0G0H0
            "zip2 v15.8h, v12.8h, v13.8h\n"  // q15=A1B1C1D1E1F1G1H1
            // Stores advance by 32 bytes: column vectors are 16 rows apart.
            "str q14, [%[outptr]], #32\n"
            "str q15, [%[outptr]], #32\n"
            "zip1 v0.8h, v20.8h, v21.8h\n"   // q0=A2B2C2D2E2F2G2H2
            "zip2 v1.8h, v20.8h, v21.8h\n"   // q1=A3B3C3D3E3F3G3H3
            "str q0, [%[outptr]], #32\n"
            "str q1, [%[outptr]], #32\n"
            "zip1 v2.8h, v22.8h, v23.8h\n"   // q2=A4B4C4D4E4F4G4H4
            "zip2 v3.8h, v22.8h, v23.8h\n"   // q3=A5B5C5D5E5F5G5H5
            "str q2, [%[outptr]], #32\n"
            "str q3, [%[outptr]], #32\n"
            "zip1 v4.8h, v30.8h, v31.8h\n"   // q4=A6B6C6D6E6F6G6H6
            "zip2 v5.8h, v30.8h, v31.8h\n"   // q5=A7B7C7D7E7F7G7H7
            "str q4, [%[outptr]], #32\n"
            "str q5, [%[outptr]], #32\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1),
              [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
              [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [outptr] "+r"(outptr)
            : [skippf] "r"(skippf)
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
              "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
              "v31", "cc", "memory");
}
template <typename T>
static inline void interleave_8x1_8_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr, int skippf = 0) {
    //! Full 8x8 halfword transpose: reads one vector (8 elements) from each of
    //! rows A..H and writes the 8 column vectors A0..H0, A1..H1, ..., A7..H7
    //! contiguously (8 x 16 = 128 bytes; each stp stores a pair and advances
    //! outptr by 32). Every input pointer advances by 16 bytes. When
    //! skippf != 0 the software prefetches are branched over.
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            // Load up 8 elements (1 vector) from each of 8 sources.
            "cbnz %w[skippf], 1f\n"
            ASM_PREFETCH("[%[inptr0], #128]")
            ASM_PREFETCH("[%[inptr1], #128]")
            ASM_PREFETCH("[%[inptr2], #128]")
            ASM_PREFETCH("[%[inptr3], #128]")
            "1:\n"
            "ldr q0, [%[inptr0]], #16\n"     // q0=A0A1A2A3A4A5A6A7
            "ldr q4, [%[inptr4]], #16\n"     // q4=E0E1E2E3E4E5E6E7
            "ldr q2, [%[inptr2]], #16\n"     // q2=C0C1C2C3...
            "ldr q6, [%[inptr6]], #16\n"     // q6=G0G1G2G3...
            "zip1 v8.8h, v0.8h, v4.8h\n"     // q8=A0E0A1E1A2E2A3E3
            "zip2 v16.8h, v0.8h, v4.8h\n"    // q16=A4E4A5E5A6E6A7E7
            "zip1 v9.8h, v2.8h, v6.8h\n"     // q9=C0G0C1G1C2G2C3G3
            "zip2 v17.8h, v2.8h, v6.8h\n"    // q17=C4G4C5G5C6G6C7G7
            "ldr q1, [%[inptr1]], #16\n"     // q1=B0B1B2B3B4B5B6B7
            "ldr q5, [%[inptr5]], #16\n"     // q5=F0F1F2F3...
            "ldr q3, [%[inptr3]], #16\n"     // q3=D0D1D2D3...
            "ldr q7, [%[inptr7]], #16\n"     // q7=H0H1H2H3...
            "zip1 v10.8h, v1.8h, v5.8h\n"    // q10=B0F0B1F1B2F2B3F3
            "zip2 v18.8h, v1.8h, v5.8h\n"    // q18=B4F4B5F5B6F6B7F7
            "zip1 v11.8h, v3.8h, v7.8h\n"    // q11=D0H0D1H1D2H2D3H3
            "zip2 v19.8h, v3.8h, v7.8h\n"    // q19=D4H4D5H5D6H6D7H7
            "zip1 v12.8h, v8.8h, v9.8h\n"    // q12=A0C0E0G0A1C1E1G1
            "zip2 v20.8h, v8.8h, v9.8h\n"    // q20=A2C2E2G2A3C3E3G3
            "zip1 v13.8h, v10.8h, v11.8h\n"  // q13=B0D0F0H0B1D1F1H1
            "zip2 v21.8h, v10.8h, v11.8h\n"  // q21=B2D2F2H2B3D3F3H3
            "cbnz %w[skippf], 2f\n"
            ASM_PREFETCH("[%[inptr4], #112]")
            ASM_PREFETCH("[%[inptr5], #112]")
            ASM_PREFETCH("[%[inptr6], #112]")
            ASM_PREFETCH("[%[inptr7], #112]")
            "2:\n"
            "zip1 v22.8h, v16.8h, v17.8h\n"  // q22=A4C4E4G4A5C5E5G5
            "zip2 v30.8h, v16.8h, v17.8h\n"  // q30=A6C6E6G6A7C7E7G7
            "zip1 v23.8h, v18.8h, v19.8h\n"  // q23=B4D4F4H4B5D5F5H5
            "zip2 v31.8h, v18.8h, v19.8h\n"  // q31=B6D6F6H6B7D7F7H7
            "zip1 v14.8h, v12.8h, v13.8h\n"  // q14=A0B0C0D0E0F0G0H0
            "zip2 v15.8h, v12.8h, v13.8h\n"  // q15=A1B1C1D1E1F1G1H1
            "stp q14, q15, [%[outptr]], #32\n"  // Write back first two elements
            "zip1 v0.8h, v20.8h, v21.8h\n"   // q0=A2B2C2D2E2F2G2H2
            "zip2 v1.8h, v20.8h, v21.8h\n"   // q1=A3B3C3D3E3F3G3H3
            "stp q0, q1, [%[outptr]], #32\n"    // Write back next two elements
            "zip1 v2.8h, v22.8h, v23.8h\n"   // q2=A4B4C4D4E4F4G4H4
            "zip2 v3.8h, v22.8h, v23.8h\n"   // q3=A5B5C5D5E5F5G5H5
            "stp q2, q3, [%[outptr]], #32\n"    // Write back next two elements
            "zip1 v4.8h, v30.8h, v31.8h\n"   // q4=A6B6C6D6E6F6G6H6
            "zip2 v5.8h, v30.8h, v31.8h\n"   // q5=A7B7C7D7E7F7G7H7
            "stp q4, q5, [%[outptr]], #32\n"    // Write back last two elements
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1),
              [inptr2] "+r"(inptr2), [inptr3] "+r"(inptr3),
              [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
              [outptr] "+r"(outptr)
            : [skippf] "r"(skippf)
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
              "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
              "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
              "v31", "cc", "memory");
}
template <typename T>
static inline void interleave_4x1_4_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    //! 4x4 halfword transpose: reads 4 elements (8 bytes) from each of rows
    //! A..D, advancing each input pointer by 8, and stores the columns
    //! A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3 contiguously (32 bytes total).
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr d0, [%[inptr0]], #8\n"      // d0 = A0A1A2A3
            "ldr d1, [%[inptr1]], #8\n"      // d1 = B0B1B2B3
            "ldr d2, [%[inptr2]], #8\n"      // d2 = C0C1C2C3
            "ldr d3, [%[inptr3]], #8\n"      // d3 = D0D1D2D3
            "zip1 v4.4h, v0.4h, v2.4h\n"     // d4 = A0C0A1C1
            "zip2 v8.4h, v0.4h, v2.4h\n"     // d8 = A2C2A3C3
            "zip1 v5.4h, v1.4h, v3.4h\n"     // d5 = B0D0B1D1
            "zip2 v9.4h, v1.4h, v3.4h\n"     // d9 = B2D2B3D3
            "zip1 v6.4h, v4.4h, v5.4h\n"     // d6 = A0B0C0D0
            "zip2 v7.4h, v4.4h, v5.4h\n"     // d7 = A1B1C1D1
            "stp d6, d7, [%[outptr]], #16\n"
            "zip1 v10.4h, v8.4h, v9.4h\n"    // d10 = A2B2C2D2
            "zip2 v11.4h, v8.4h, v9.4h\n"    // d11 = A3B3C3D3
            "stp d10, d11, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "memory");
}
static inline void interleave_4x1_2_d(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, int32_t*& outptr) {
    //! 4x2 doubleword transpose: reads 2 int64 elements (16 bytes) from each
    //! of rows A..D, advancing each input pointer by 16, and writes the two
    //! columns as A0B0, C0D0, A1B1, C1D1 (64 bytes total).
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n"  // v0 = A0A1
            "ld1 {v1.2d}, [%[inptr1]], #16\n"  // v1 = B0B1
            "ld1 {v2.2d}, [%[inptr2]], #16\n"  // v2 = C0C1
            "ld1 {v3.2d}, [%[inptr3]], #16\n"  // v3 = D0D1
            "zip1 v4.2d, v0.2d, v1.2d\n"       // v4 = A0B0
            "zip2 v5.2d, v0.2d, v1.2d\n"       // v5 = A1B1
            "zip1 v6.2d, v2.2d, v3.2d\n"       // v6 = C0D0
            "zip2 v7.2d, v2.2d, v3.2d\n"       // v7 = C1D1
            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v6.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            "st1 {v7.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory");
}
static inline void interleave_4x2_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, int64_t*& outptr) {
    //! Interleave 4 rows at 2-doubleword (16-byte) granularity: reads two
    //! vectors per row (A0,A1 etc., 32 bytes per input pointer) and writes
    //! them grouped by vector index: A0 B0 C0 D0 then A1 B1 C1 D1
    //! (128 bytes total). No element-level shuffling is performed.
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n"  // v0 = A0
            "ld1 {v1.2d}, [%[inptr0]], #16\n"  // v1 = A1
            "ld1 {v2.2d}, [%[inptr1]], #16\n"  // v2 = B0
            "ld1 {v3.2d}, [%[inptr1]], #16\n"  // v3 = B1
            "ld1 {v4.2d}, [%[inptr2]], #16\n"  // v4 = C0
            "ld1 {v5.2d}, [%[inptr2]], #16\n"  // v5 = C1
            "ld1 {v6.2d}, [%[inptr3]], #16\n"  // v6 = D0
            "ld1 {v7.2d}, [%[inptr3]], #16\n"  // v7 = D1
            "st1 {v0.2d}, [%[outptr]], #16\n"
            "st1 {v2.2d}, [%[outptr]], #16\n"
            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v6.2d}, [%[outptr]], #16\n"
            "st1 {v1.2d}, [%[outptr]], #16\n"
            "st1 {v3.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            "st1 {v7.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory");
}
static inline void interleave_12x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, const int32_t*& inptr4, const int32_t*& inptr5,
        const int32_t*& inptr6, const int32_t*& inptr7, const int32_t*& inptr8,
        const int32_t*& inptr9, const int32_t*& inptr10, const int32_t*& inptr11,
        int32_t*& outptr) {
    //! 12x4 int32 transpose: reads 4 elements (16 bytes) from each of 12 rows
    //! A..L, advancing every input pointer by 16. The 12 rows are transposed
    //! in three independent 4x4 groups (A-D, E-H, I-L) and stored so that the
    //! output holds column 0 of all 12 rows, then column 1, 2, 3
    //! (12 vectors = 192 bytes total).
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"   // v0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n"   // v1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n"   // v2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n"   // v3 = D0D1D2D3
            "zip1 v12.4s, v0.4s, v2.4s\n"       // v12 = A0C0A1C1
            "zip2 v13.4s, v0.4s, v2.4s\n"       // v13 = A2C2A3C3
            "zip1 v14.4s, v1.4s, v3.4s\n"       // v14 = B0D0B1D1
            "zip2 v15.4s, v1.4s, v3.4s\n"       // v15 = B2D2B3D3
            "zip1 v0.4s, v12.4s, v14.4s\n"      // v0 = A0B0C0D0
            "zip2 v1.4s, v12.4s, v14.4s\n"      // v1 = A1B1C1D1
            "zip1 v2.4s, v13.4s, v15.4s\n"      // v2 = A2B2C2D2
            "zip2 v3.4s, v13.4s, v15.4s\n"      // v3 = A3B3C3D3
            "ld1 {v4.4s}, [%[inptr4]], #16\n"   // v4 = E0E1E2E3
            "ld1 {v5.4s}, [%[inptr5]], #16\n"   // v5 = F0F1F2F3
            "ld1 {v6.4s}, [%[inptr6]], #16\n"   // v6 = G0G1G2G3
            "ld1 {v7.4s}, [%[inptr7]], #16\n"   // v7 = H0H1H2H3
            "zip1 v16.4s, v4.4s, v6.4s\n"       // v16 = E0G0E1G1
            "zip2 v17.4s, v4.4s, v6.4s\n"       // v17 = E2G2E3G3
            "zip1 v18.4s, v5.4s, v7.4s\n"       // v18 = F0H0F1H1
            "zip2 v19.4s, v5.4s, v7.4s\n"       // v19 = F2H2F3H3
            "zip1 v4.4s, v16.4s, v18.4s\n"      // v4 = E0F0G0H0
            "zip2 v5.4s, v16.4s, v18.4s\n"      // v5 = E1F1G1H1
            "zip1 v6.4s, v17.4s, v19.4s\n"      // v6 = E2F2G2H2
            "zip2 v7.4s, v17.4s, v19.4s\n"      // v7 = E3F3G3H3
            "ld1 {v8.4s}, [%[inptr8]], #16\n"   // v8 = I0I1I2I3
            "ld1 {v9.4s}, [%[inptr9]], #16\n"   // v9 = J0J1J2J3
            "ld1 {v10.4s}, [%[inptr10]], #16\n" // v10 = K0K1K2K3
            "ld1 {v11.4s}, [%[inptr11]], #16\n" // v11 = L0L1L2L3
            "zip1 v20.4s, v8.4s, v10.4s\n"      // v20 = I0K0I1K1
            "zip2 v21.4s, v8.4s, v10.4s\n"      // v21 = I2K2I3K3
            "zip1 v22.4s, v9.4s, v11.4s\n"      // v22 = J0L0J1L1
            "zip2 v23.4s, v9.4s, v11.4s\n"      // v23 = J2L2J3L3
            "zip1 v8.4s, v20.4s, v22.4s\n"      // v8 = I0J0K0L0
            "zip2 v9.4s, v20.4s, v22.4s\n"      // v9 = I1J1K1L1
            "zip1 v10.4s, v21.4s, v23.4s\n"     // v10 = I2J2K2L2
            "zip2 v11.4s, v21.4s, v23.4s\n"     // v11 = I3J3K3L3
            // Store column-by-column: (A..L)0, (A..L)1, (A..L)2, (A..L)3.
            "st1 {v0.4s}, [%[outptr]], #16\n"
            "st1 {v4.4s}, [%[outptr]], #16\n"
            "st1 {v8.4s}, [%[outptr]], #16\n"
            "st1 {v1.4s}, [%[outptr]], #16\n"
            "st1 {v5.4s}, [%[outptr]], #16\n"
            "st1 {v9.4s}, [%[outptr]], #16\n"
            "st1 {v2.4s}, [%[outptr]], #16\n"
            "st1 {v6.4s}, [%[outptr]], #16\n"
            "st1 {v10.4s}, [%[outptr]], #16\n"
            "st1 {v3.4s}, [%[outptr]], #16\n"
            "st1 {v7.4s}, [%[outptr]], #16\n"
            "st1 {v11.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8),
              [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), [inptr11] "+r"(inptr11),
              [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "cc", "memory");
}
template <typename T>
static inline void interleave_12x1_4_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, const T*& in4,
        const T*& in5, const T*& in6, const T*& in7, const T*& in8, const T*& in9,
        const T*& in10, const T*& in11, T*& out) {
    //! 12x4 halfword transpose, the 16-bit analogue of interleave_12x1_4_s:
    //! reads 4 elements (8 bytes) from each of 12 rows A..L (each pointer
    //! advances by 8) and stores column 0 of all 12 rows, then columns 1, 2, 3
    //! (96 bytes total). T may be int16_t or uint16_t; the reference casts
    //! below view the caller's pointers as int16_t* so the post-increment
    //! side effects propagate back through the original references.
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_12x1_4_h only support uint16_t and int16_t");
    const int16_t*& inptr0 = reinterpret_cast<const int16_t*&>(in0);
    const int16_t*& inptr1 = reinterpret_cast<const int16_t*&>(in1);
    const int16_t*& inptr2 = reinterpret_cast<const int16_t*&>(in2);
    const int16_t*& inptr3 = reinterpret_cast<const int16_t*&>(in3);
    const int16_t*& inptr4 = reinterpret_cast<const int16_t*&>(in4);
    const int16_t*& inptr5 = reinterpret_cast<const int16_t*&>(in5);
    const int16_t*& inptr6 = reinterpret_cast<const int16_t*&>(in6);
    const int16_t*& inptr7 = reinterpret_cast<const int16_t*&>(in7);
    const int16_t*& inptr8 = reinterpret_cast<const int16_t*&>(in8);
    const int16_t*& inptr9 = reinterpret_cast<const int16_t*&>(in9);
    const int16_t*& inptr10 = reinterpret_cast<const int16_t*&>(in10);
    const int16_t*& inptr11 = reinterpret_cast<const int16_t*&>(in11);
    int16_t*& outptr = reinterpret_cast<int16_t*&>(out);
    asm volatile(
            "ld1 {v0.4h}, [%[inptr0]], #8\n"    // v0 = A0A1A2A3
            "ld1 {v1.4h}, [%[inptr1]], #8\n"    // v1 = B0B1B2B3
            "ld1 {v2.4h}, [%[inptr2]], #8\n"    // v2 = C0C1C2C3
            "ld1 {v3.4h}, [%[inptr3]], #8\n"    // v3 = D0D1D2D3
            "zip1 v12.4h, v0.4h, v2.4h\n"       // v12 = A0C0A1C1
            "zip2 v13.4h, v0.4h, v2.4h\n"       // v13 = A2C2A3C3
            "zip1 v14.4h, v1.4h, v3.4h\n"       // v14 = B0D0B1D1
            "zip2 v15.4h, v1.4h, v3.4h\n"       // v15 = B2D2B3D3
            "zip1 v0.4h, v12.4h, v14.4h\n"      // v0 = A0B0C0D0
            "zip2 v1.4h, v12.4h, v14.4h\n"      // v1 = A1B1C1D1
            "zip1 v2.4h, v13.4h, v15.4h\n"      // v2 = A2B2C2D2
            "zip2 v3.4h, v13.4h, v15.4h\n"      // v3 = A3B3C3D3
            "ld1 {v4.4h}, [%[inptr4]], #8\n"    // v4 = E0E1E2E3
            "ld1 {v5.4h}, [%[inptr5]], #8\n"    // v5 = F0F1F2F3
            "ld1 {v6.4h}, [%[inptr6]], #8\n"    // v6 = G0G1G2G3
            "ld1 {v7.4h}, [%[inptr7]], #8\n"    // v7 = H0H1H2H3
            "zip1 v16.4h, v4.4h, v6.4h\n"       // v16 = E0G0E1G1
            "zip2 v17.4h, v4.4h, v6.4h\n"       // v17 = E2G2E3G3
            "zip1 v18.4h, v5.4h, v7.4h\n"       // v18 = F0H0F1H1
            "zip2 v19.4h, v5.4h, v7.4h\n"       // v19 = F2H2F3H3
            "zip1 v4.4h, v16.4h, v18.4h\n"      // v4 = E0F0G0H0
            "zip2 v5.4h, v16.4h, v18.4h\n"      // v5 = E1F1G1H1
            "zip1 v6.4h, v17.4h, v19.4h\n"      // v6 = E2F2G2H2
            "zip2 v7.4h, v17.4h, v19.4h\n"      // v7 = E3F3G3H3
            "ld1 {v8.4h}, [%[inptr8]], #8\n"    // v8 = I0I1I2I3
            "ld1 {v9.4h}, [%[inptr9]], #8\n"    // v9 = J0J1J2J3
            "ld1 {v10.4h}, [%[inptr10]], #8\n"  // v10 = K0K1K2K3
            "ld1 {v11.4h}, [%[inptr11]], #8\n"  // v11 = L0L1L2L3
            "zip1 v20.4h, v8.4h, v10.4h\n"      // v20 = I0K0I1K1
            "zip2 v21.4h, v8.4h, v10.4h\n"      // v21 = I2K2I3K3
            "zip1 v22.4h, v9.4h, v11.4h\n"      // v22 = J0L0J1L1
            "zip2 v23.4h, v9.4h, v11.4h\n"      // v23 = J2L2J3L3
            "zip1 v8.4h, v20.4h, v22.4h\n"      // v8 = I0J0K0L0
            "zip2 v9.4h, v20.4h, v22.4h\n"      // v9 = I1J1K1L1
            "zip1 v10.4h, v21.4h, v23.4h\n"     // v10 = I2J2K2L2
            "zip2 v11.4h, v21.4h, v23.4h\n"     // v11 = I3J3K3L3
            // Store column-by-column: (A..L)0, (A..L)1, (A..L)2, (A..L)3.
            "st1 {v0.4h}, [%[outptr]], #8\n"    // v0 = A0B0C0D0
            "st1 {v4.4h}, [%[outptr]], #8\n"    // v4 = E0F0G0H0
            "st1 {v8.4h}, [%[outptr]], #8\n"    // v8 = I0J0K0L0
            "st1 {v1.4h}, [%[outptr]], #8\n"    // v1 = A1B1C1D1
            "st1 {v5.4h}, [%[outptr]], #8\n"    // v5 = E1F1G1H1
            "st1 {v9.4h}, [%[outptr]], #8\n"    // v9 = I1J1K1L1
            "st1 {v2.4h}, [%[outptr]], #8\n"    // v2 = A2B2C2D2
            "st1 {v6.4h}, [%[outptr]], #8\n"    // v6 = E2F2G2H2
            "st1 {v10.4h}, [%[outptr]], #8\n"   // v10 = I2J2K2L2
            "st1 {v3.4h}, [%[outptr]], #8\n"    // v3 = A3B3C3D3
            "st1 {v7.4h}, [%[outptr]], #8\n"    // v7 = E3F3G3H3
            "st1 {v11.4h}, [%[outptr]], #8\n"   // v11 = I3J3K3L3
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8),
              [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), [inptr11] "+r"(inptr11),
              [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "cc", "memory");
}
  522. template <typename T>
  523. static inline void interleave_12x4_4_b(
  524. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  525. const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
  526. const T*& inptr8, const T*& inptr9, const T*& inptr10, const T*& inptr11,
  527. T*& outptr) {
  528. static_assert(
  529. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  530. "interleave_12x4_4_b only support uint8_t and int8_t");
  531. interleave_12x1_4_s(
  532. reinterpret_cast<const int32_t*&>(inptr0),
  533. reinterpret_cast<const int32_t*&>(inptr1),
  534. reinterpret_cast<const int32_t*&>(inptr2),
  535. reinterpret_cast<const int32_t*&>(inptr3),
  536. reinterpret_cast<const int32_t*&>(inptr4),
  537. reinterpret_cast<const int32_t*&>(inptr5),
  538. reinterpret_cast<const int32_t*&>(inptr6),
  539. reinterpret_cast<const int32_t*&>(inptr7),
  540. reinterpret_cast<const int32_t*&>(inptr8),
  541. reinterpret_cast<const int32_t*&>(inptr9),
  542. reinterpret_cast<const int32_t*&>(inptr10),
  543. reinterpret_cast<const int32_t*&>(inptr11),
  544. reinterpret_cast<int32_t*&>(outptr));
  545. }
// Interleave 2 rows of int32, 4 elements (one 128-bit vector) per row:
// emits A0A1A2A3 B0B1B2B3 and advances all three pointers past the
// consumed/produced data.
static inline void interleave_2x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"  // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n"  // d1 = B0B1B2B3
            "st1 {v0.4s}, [%[outptr]], #16\n"
            "st1 {v1.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "cc", "memory");
}
// Interleave 8 rows of int32, 1 element at a time, 4 repeats: reads 4
// elements from each of the 8 rows and writes them column-major, i.e.
// A0B0C0D0 E0F0G0H0 A1B1C1D1 E1F1G1H1 ... (a zip-based 8x4 transpose).
// All pointers are advanced in place.
static inline void interleave_8x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, const int32_t*& inptr4, const int32_t*& inptr5,
        const int32_t*& inptr6, const int32_t*& inptr7, int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"   // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n"   // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n"   // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n"   // d3 = D0D1D2D3
            "zip1 v8.4s, v0.4s, v2.4s\n"        // d8 = A0C0A1C1
            "zip2 v9.4s, v0.4s, v2.4s\n"        // d9 = A2C2A3C3
            "zip1 v10.4s, v1.4s, v3.4s\n"       // d10 = B0D0B1D1
            "zip2 v11.4s, v1.4s, v3.4s\n"       // d11 = B2D2B3D3
            "zip1 v12.4s, v8.4s, v10.4s\n"      // d12 = A0B0C0D0
            "zip2 v13.4s, v8.4s, v10.4s\n"      // d13 = A1B1C1D1
            "zip1 v14.4s, v9.4s, v11.4s\n"      // d14 = A2B2C2D2
            "zip2 v15.4s, v9.4s, v11.4s\n"      // d15 = A3B3C3D3
            "ld1 {v4.4s}, [%[inptr4]], #16\n"   // d4 = E0E1E2E3
            "ld1 {v5.4s}, [%[inptr5]], #16\n"   // d5 = F0F1F2F3
            "ld1 {v6.4s}, [%[inptr6]], #16\n"   // d6 = G0G1G2G3
            "ld1 {v7.4s}, [%[inptr7]], #16\n"   // d7 = H0H1H2H3
            "zip1 v16.4s, v4.4s, v6.4s\n"       // d16 = E0G0E1G1
            "zip2 v17.4s, v4.4s, v6.4s\n"       // d17 = E2G2E3G3
            "zip1 v18.4s, v5.4s, v7.4s\n"       // d18 = F0H0F1H1
            "zip2 v19.4s, v5.4s, v7.4s\n"       // d19 = F2H2F3H3
            "zip1 v20.4s, v16.4s, v18.4s\n"     // d20 = E0F0G0H0
            "zip2 v21.4s, v16.4s, v18.4s\n"     // d21 = E1F1G1H1
            "zip1 v22.4s, v17.4s, v19.4s\n"     // d22 = E2F2G2H2
            "zip2 v23.4s, v17.4s, v19.4s\n"     // d23 = E3F3G3H3
            // store column k of rows A-D, then column k of rows E-H
            "st1 {v12.4s}, [%[outptr]], #16\n"
            "st1 {v20.4s}, [%[outptr]], #16\n"
            "st1 {v13.4s}, [%[outptr]], #16\n"
            "st1 {v21.4s}, [%[outptr]], #16\n"
            "st1 {v14.4s}, [%[outptr]], #16\n"
            "st1 {v22.4s}, [%[outptr]], #16\n"
            "st1 {v15.4s}, [%[outptr]], #16\n"
            "st1 {v23.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
              "v22", "v23", "cc", "memory");
}
// Interleave 8 rows of int64, 1 element at a time, 2 repeats: reads 2
// elements per row and writes column-major, i.e. A0B0 C0D0 E0F0 G0H0
// A1B1 C1D1 E1F1 G1H1. All pointers are advanced in place.
static inline void interleave_8x1_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, const int64_t*& inptr4, const int64_t*& inptr5,
        const int64_t*& inptr6, const int64_t*& inptr7, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n"  // d0 = A0A1
            "ld1 {v1.2d}, [%[inptr1]], #16\n"  // d1 = B0B1
            "ld1 {v2.2d}, [%[inptr2]], #16\n"  // d2 = C0C1
            "ld1 {v3.2d}, [%[inptr3]], #16\n"  // d3 = D0D1
            "ld1 {v4.2d}, [%[inptr4]], #16\n"  // d4 = E0E1
            "ld1 {v5.2d}, [%[inptr5]], #16\n"  // d5 = F0F1
            "ld1 {v6.2d}, [%[inptr6]], #16\n"  // d6 = G0G1
            "ld1 {v7.2d}, [%[inptr7]], #16\n"  // d7 = H0H1
            "zip1 v8.2d, v0.2d, v1.2d\n"       // d8 = A0B0
            "zip2 v9.2d, v0.2d, v1.2d\n"       // d9 = A1B1
            "zip1 v10.2d, v2.2d, v3.2d\n"      // d10 = C0D0
            "zip2 v11.2d, v2.2d, v3.2d\n"      // d11 = C1D1
            "zip1 v12.2d, v4.2d, v5.2d\n"      // d12 = E0F0
            "zip2 v13.2d, v4.2d, v5.2d\n"      // d13 = E1F1
            "zip1 v14.2d, v6.2d, v7.2d\n"      // d14 = G0H0
            "zip2 v15.2d, v6.2d, v7.2d\n"      // d15 = G1H1
            // column 0 of all rows, then column 1
            "st1 {v8.2d}, [%[outptr]], #16\n"
            "st1 {v10.2d}, [%[outptr]], #16\n"
            "st1 {v12.2d}, [%[outptr]], #16\n"
            "st1 {v14.2d}, [%[outptr]], #16\n"
            "st1 {v9.2d}, [%[outptr]], #16\n"
            "st1 {v11.2d}, [%[outptr]], #16\n"
            "st1 {v13.2d}, [%[outptr]], #16\n"
            "st1 {v15.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}
// Interleave 8 rows of int64, 2 elements (one 128-bit vector) at a time,
// 2 repeats: reads two vectors per row and writes the first vector of every
// row (A0 B0 ... H0) followed by the second (A1 B1 ... H1). All pointers are
// advanced in place.
static inline void interleave_8x2_2_d(
        const int64_t*& inptr0, const int64_t*& inptr1, const int64_t*& inptr2,
        const int64_t*& inptr3, const int64_t*& inptr4, const int64_t*& inptr5,
        const int64_t*& inptr6, const int64_t*& inptr7, int64_t*& outptr) {
    asm volatile(
            "ld1 {v0.2d}, [%[inptr0]], #16\n"   // d0 = A0 (first pair of row A)
            "ld1 {v1.2d}, [%[inptr0]], #16\n"   // d1 = A1 (second pair of row A)
            "ld1 {v2.2d}, [%[inptr1]], #16\n"   // d2 = B0
            "ld1 {v3.2d}, [%[inptr1]], #16\n"   // d3 = B1
            "ld1 {v4.2d}, [%[inptr2]], #16\n"   // d4 = C0
            "ld1 {v5.2d}, [%[inptr2]], #16\n"   // d5 = C1
            "ld1 {v6.2d}, [%[inptr3]], #16\n"   // d6 = D0
            "ld1 {v7.2d}, [%[inptr3]], #16\n"   // d7 = D1
            "ld1 {v8.2d}, [%[inptr4]], #16\n"   // d8 = E0
            "ld1 {v9.2d}, [%[inptr4]], #16\n"   // d9 = E1
            "ld1 {v10.2d}, [%[inptr5]], #16\n"  // d10 = F0
            "ld1 {v11.2d}, [%[inptr5]], #16\n"  // d11 = F1
            "ld1 {v12.2d}, [%[inptr6]], #16\n"  // d12 = G0
            "ld1 {v13.2d}, [%[inptr6]], #16\n"  // d13 = G1
            "ld1 {v14.2d}, [%[inptr7]], #16\n"  // d14 = H0
            "ld1 {v15.2d}, [%[inptr7]], #16\n"  // d15 = H1
            // first vector of each row, then second vector of each row
            "st1 {v0.2d}, [%[outptr]], #16\n"
            "st1 {v2.2d}, [%[outptr]], #16\n"
            "st1 {v4.2d}, [%[outptr]], #16\n"
            "st1 {v6.2d}, [%[outptr]], #16\n"
            "st1 {v8.2d}, [%[outptr]], #16\n"
            "st1 {v10.2d}, [%[outptr]], #16\n"
            "st1 {v12.2d}, [%[outptr]], #16\n"
            "st1 {v14.2d}, [%[outptr]], #16\n"
            "st1 {v1.2d}, [%[outptr]], #16\n"
            "st1 {v3.2d}, [%[outptr]], #16\n"
            "st1 {v5.2d}, [%[outptr]], #16\n"
            "st1 {v7.2d}, [%[outptr]], #16\n"
            "st1 {v9.2d}, [%[outptr]], #16\n"
            "st1 {v11.2d}, [%[outptr]], #16\n"
            "st1 {v13.2d}, [%[outptr]], #16\n"
            "st1 {v15.2d}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}
  682. template <typename T>
  683. static inline void interleave_2x4_4_b(const T*& inptr0, const T*& inptr1, T*& outptr) {
  684. static_assert(
  685. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  686. "interleave_2x4_4_b only support uint8_t and int8_t");
  687. interleave_2x1_4_s(
  688. reinterpret_cast<const int32_t*&>(inptr0),
  689. reinterpret_cast<const int32_t*&>(inptr1),
  690. reinterpret_cast<int32_t*&>(outptr));
  691. }
  692. template <typename T>
  693. static inline void interleave_8x4_4_b(
  694. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  695. const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
  696. T*& outptr) {
  697. static_assert(
  698. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  699. "interleave_8x4_4_b only support uint8_t and int8_t");
  700. interleave_8x1_4_s(
  701. reinterpret_cast<const int32_t*&>(inptr0),
  702. reinterpret_cast<const int32_t*&>(inptr1),
  703. reinterpret_cast<const int32_t*&>(inptr2),
  704. reinterpret_cast<const int32_t*&>(inptr3),
  705. reinterpret_cast<const int32_t*&>(inptr4),
  706. reinterpret_cast<const int32_t*&>(inptr5),
  707. reinterpret_cast<const int32_t*&>(inptr6),
  708. reinterpret_cast<const int32_t*&>(inptr7),
  709. reinterpret_cast<int32_t*&>(outptr));
  710. }
// Interleave 4 rows of 16-bit elements, 8 per row, into column-major groups
// of 4: output is A1B1C1D1 A2B2C2D2 ... A8B8C8D8 (a 4x8 transpose done with
// trn/zip). Input pointers are advanced; `out` is taken by value so the
// caller's copy is unchanged.
template <typename T>
static inline void interleave_8x4_1_h(
        const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
    static_assert(sizeof(T) == 2, "only support size == 2");
    asm volatile(
            "ldr q0, [%[in0]], #16\n"        // A1A2A3A4A5A6A7A8
            "ldr q1, [%[in1]], #16\n"        // B1B2B3B4B5B6B7B8
            "ldr q2, [%[in2]], #16\n"        // C1C2C3C4C5C6C7C8
            "ldr q3, [%[in3]], #16\n"        // D1D2D3D4D5D6D7D8
            "trn1 v4.8h, v0.8h, v1.8h\n"     // A1B1A3B3A5B5A7B7
            "trn2 v5.8h, v0.8h, v1.8h\n"     // A2B2A4B4A6B6A8B8
            "trn1 v6.8h, v2.8h, v3.8h\n"     // C1D1C3D3C5D5C7D7
            "trn2 v7.8h, v2.8h, v3.8h\n"     // C2D2C4D4C6D6C8D8
            "zip1 v8.4s, v4.4s, v6.4s\n"     // A1B1C1D1A3B3C3D3
            "zip2 v9.4s, v4.4s, v6.4s\n"     // A5B5C5D5A7B7C7D7
            "zip1 v10.4s, v5.4s, v7.4s\n"    // A2B2C2D2A4B4C4D4
            "zip2 v11.4s, v5.4s, v7.4s\n"    // A6B6C6D6A8B8C8D8
            "zip1 v12.2d, v8.2d, v10.2d\n"   // A1B1C1D1A2B2C2D2
            "zip2 v13.2d, v8.2d, v10.2d\n"   // A3B3C3D3A4B4C4D4
            "zip1 v14.2d, v9.2d, v11.2d\n"   // A5B5C5D5A6B6C6D6
            "zip2 v15.2d, v9.2d, v11.2d\n"   // A7B7C7D7A8B8C8D8
            "st1 {v12.2d}, [%[out]], #16\n"
            "st1 {v13.2d}, [%[out]], #16\n"
            "st1 {v14.2d}, [%[out]], #16\n"
            "st1 {v15.2d}, [%[out]], #16\n"
            : [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
              [out] "+r"(out)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "memory");
}
  742. template <typename T>
  743. static inline void interleave_8x8_2_b(
  744. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  745. const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
  746. T*& outptr) {
  747. static_assert(
  748. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  749. "interleave_8x8_2_b only support uint8_t and int8_t");
  750. interleave_8x1_2_d(
  751. reinterpret_cast<const int64_t*&>(inptr0),
  752. reinterpret_cast<const int64_t*&>(inptr1),
  753. reinterpret_cast<const int64_t*&>(inptr2),
  754. reinterpret_cast<const int64_t*&>(inptr3),
  755. reinterpret_cast<const int64_t*&>(inptr4),
  756. reinterpret_cast<const int64_t*&>(inptr5),
  757. reinterpret_cast<const int64_t*&>(inptr6),
  758. reinterpret_cast<const int64_t*&>(inptr7),
  759. reinterpret_cast<int64_t*&>(outptr));
  760. }
  761. template <typename T>
  762. static inline void interleave_8x8_2_h(
  763. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  764. const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
  765. T*& outptr) {
  766. static_assert(
  767. std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
  768. "interleave_8x8_2_h only support uint16_t and int16_t");
  769. interleave_8x2_2_d(
  770. reinterpret_cast<const int64_t*&>(inptr0),
  771. reinterpret_cast<const int64_t*&>(inptr1),
  772. reinterpret_cast<const int64_t*&>(inptr2),
  773. reinterpret_cast<const int64_t*&>(inptr3),
  774. reinterpret_cast<const int64_t*&>(inptr4),
  775. reinterpret_cast<const int64_t*&>(inptr5),
  776. reinterpret_cast<const int64_t*&>(inptr6),
  777. reinterpret_cast<const int64_t*&>(inptr7),
  778. reinterpret_cast<int64_t*&>(outptr));
  779. }
  780. template <typename T>
  781. static inline void interleave_8x2_8_b(
  782. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  783. const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
  784. T*& outptr) {
  785. static_assert(
  786. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  787. "interleave_8x2_8_b only support uint8_t and int8_t");
  788. interleave_8x1_8_h(
  789. reinterpret_cast<const int16_t*&>(inptr0),
  790. reinterpret_cast<const int16_t*&>(inptr1),
  791. reinterpret_cast<const int16_t*&>(inptr2),
  792. reinterpret_cast<const int16_t*&>(inptr3),
  793. reinterpret_cast<const int16_t*&>(inptr4),
  794. reinterpret_cast<const int16_t*&>(inptr5),
  795. reinterpret_cast<const int16_t*&>(inptr6),
  796. reinterpret_cast<const int16_t*&>(inptr7),
  797. reinterpret_cast<int16_t*&>(outptr));
  798. }
// Interleave 8 rows of bytes, 8 bytes per row, 1 repeat: loads 8 bytes from
// each row and stores the rows back-to-back (row A, B, ..., H), i.e. a plain
// 8x8-byte gather/concatenation. All pointers are advanced in place.
template <typename T>
static inline void interleave_8x8_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
            "interleave_8x8_1_b only support uint8_t and int8_t");
    asm volatile(
            "ld1 {v0.d}[0], [%[inptr0]], 8\n"  // A1A2A3A4A5A6A7A8
            "ld1 {v0.d}[1], [%[inptr1]], 8\n"  // B1B2B3B4B5B6B7B8
            "ld1 {v1.d}[0], [%[inptr2]], 8\n"  // C1C2C3C4C5C6C7C8
            "ld1 {v1.d}[1], [%[inptr3]], 8\n"  // D1D2D3D4D5D6D7D8
            "ld1 {v2.d}[0], [%[inptr4]], 8\n"  // E1E2E3E4E5E6E7E8
            "ld1 {v2.d}[1], [%[inptr5]], 8\n"  // F1F2F3F4F5F6F7F8
            "ld1 {v3.d}[0], [%[inptr6]], 8\n"  // G1G2G3G4G5G6G7G8
            "ld1 {v3.d}[1], [%[inptr7]], 8\n"  // H1H2H3H4H5H6H7H8
            "st1 {v0.2d}, [%[outptr]], 16\n"   // A1A2A3A4A5A6A7A8B1B2B3B4B5B6B7B8
            "st1 {v1.2d}, [%[outptr]], 16\n"   // C1C2C3C4C5C6C7C8D1D2D3D4D5D6D7D8
            "st1 {v2.2d}, [%[outptr]], 16\n"   // E1E2E3E4E5E6E7E8F1F2F3F4F5F6F7F8
            "st1 {v3.2d}, [%[outptr]], 16\n"   // G1G2G3G4G5G6G7G8H1H2H3H4H5H6H7H8
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "memory");
}
// Interleave 8 rows of 4 bytes each, expanding every byte that packs two
// 4-bit values into two sign-extended int8 outputs: for each input byte the
// low nibble is emitted first, then the high nibble (shl #4 + sshr #4 does
// the sign-extension of the low nibble; sshr #4 alone extracts the high
// one). Produces 64 bytes. Input pointers are advanced; `outptr` is taken by
// value so the caller's copy is unchanged.
template <typename T>
static inline void interleave_8x4_1_b_with_shift(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T* outptr) {
    static_assert(sizeof(T) == 1, "only support size == 1");
    asm volatile(
            "ld1 {v0.s}[0], [%[inptr0]], #4\n"
            "ld1 {v0.s}[1], [%[inptr1]], #4\n"
            "ld1 {v0.s}[2], [%[inptr2]], #4\n"
            "ld1 {v0.s}[3], [%[inptr3]], #4\n"
            "ld1 {v1.s}[0], [%[inptr4]], #4\n"
            "ld1 {v1.s}[1], [%[inptr5]], #4\n"
            "ld1 {v1.s}[2], [%[inptr6]], #4\n"
            "ld1 {v1.s}[3], [%[inptr7]], #4\n"
            "shl v2.16b, v0.16b, #4\n"    // move low nibble into sign position
            "shl v5.16b, v1.16b, #4\n"
            "sshr v3.16b, v0.16b, #4\n"   // high nibble, sign-extended
            "sshr v4.16b, v2.16b, #4\n"   // low nibble, sign-extended
            "sshr v6.16b, v1.16b, #4\n"   // high nibble, sign-extended
            "sshr v7.16b, v5.16b, #4\n"   // low nibble, sign-extended
            "zip1 v8.16b, v4.16b, v3.16b\n"   // low0,high0,low1,high1,...
            "zip2 v9.16b, v4.16b, v3.16b\n"
            "zip1 v10.16b, v7.16b, v6.16b\n"
            "zip2 v11.16b, v7.16b, v6.16b\n"
            "st1 {v8.16b-v11.16b},[%[outptr]],#64"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "memory");
}
// Interleave 8 rows of 16-bit elements, 8 per row, 1 repeat: loads one
// 8-lane vector from each row and stores the rows back-to-back (a plain
// 8-row concatenation). All pointers are advanced in place.
template <typename T>
static inline void interleave_8x8_1_h(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
        T*& outptr) {
    static_assert(
            std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
            "interleave_8x8_1_h only support uint16_t and int16_t");
    asm volatile(
            "ld1 {v0.8h}, [%[inptr0]], #16\n"  // A1A2A3A4A5A6A7A8
            "ld1 {v1.8h}, [%[inptr1]], #16\n"  // B1B2B3B4B5B6B7B8
            "ld1 {v2.8h}, [%[inptr2]], #16\n"  // C1C2C3C4C5C6C7C8
            "ld1 {v3.8h}, [%[inptr3]], #16\n"  // D1D2D3D4D5D6D7D8
            "ld1 {v4.8h}, [%[inptr4]], #16\n"  // E1E2E3E4E5E6E7E8
            "ld1 {v5.8h}, [%[inptr5]], #16\n"  // F1F2F3F4F5F6F7F8
            "ld1 {v6.8h}, [%[inptr6]], #16\n"  // G1G2G3G4G5G6G7G8
            "ld1 {v7.8h}, [%[inptr7]], #16\n"  // H1H2H3H4H5H6H7H8
            "st1 {v0.8h}, [%[outptr]], #16\n"  // A1A2A3A4A5A6A7A8
            "st1 {v1.8h}, [%[outptr]], #16\n"  // B1B2B3B4B5B6B7B8
            "st1 {v2.8h}, [%[outptr]], #16\n"  // C1C2C3C4C5C6C7C8
            "st1 {v3.8h}, [%[outptr]], #16\n"  // D1D2D3D4D5D6D7D8
            "st1 {v4.8h}, [%[outptr]], #16\n"  // E1E2E3E4E5E6E7E8
            "st1 {v5.8h}, [%[outptr]], #16\n"  // F1F2F3F4F5F6F7F8
            "st1 {v6.8h}, [%[outptr]], #16\n"  // G1G2G3G4G5G6G7G8
            "st1 {v7.8h}, [%[outptr]], #16\n"  // H1H2H3H4H5H6H7H8
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
              [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
// Interleave 4 rows of int32, 1 element at a time, 4 repeats: a zip-based
// 4x4 transpose producing A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3. All pointers
// are advanced in place.
static inline void interleave_4x1_4_s(
        const int32_t*& inptr0, const int32_t*& inptr1, const int32_t*& inptr2,
        const int32_t*& inptr3, int32_t*& outptr) {
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"  // d0 = A0A1A2A3
            "ld1 {v1.4s}, [%[inptr1]], #16\n"  // d1 = B0B1B2B3
            "ld1 {v2.4s}, [%[inptr2]], #16\n"  // d2 = C0C1C2C3
            "ld1 {v3.4s}, [%[inptr3]], #16\n"  // d3 = D0D1D2D3
            "zip1 v8.4s, v0.4s, v2.4s\n"       // d8 = A0C0A1C1
            "zip2 v9.4s, v0.4s, v2.4s\n"       // d9 = A2C2A3C3
            "zip1 v10.4s, v1.4s, v3.4s\n"      // d10 = B0D0B1D1
            "zip2 v11.4s, v1.4s, v3.4s\n"      // d11 = B2D2B3D3
            "zip1 v12.4s, v8.4s, v10.4s\n"     // d12 = A0B0C0D0
            "zip2 v13.4s, v8.4s, v10.4s\n"     // d13 = A1B1C1D1
            "zip1 v14.4s, v9.4s, v11.4s\n"     // d14 = A2B2C2D2
            "zip2 v15.4s, v9.4s, v11.4s\n"     // d15 = A3B3C3D3
            "st1 {v12.4s}, [%[outptr]], #16\n"
            "st1 {v13.4s}, [%[outptr]], #16\n"
            "st1 {v14.4s}, [%[outptr]], #16\n"
            "st1 {v15.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}
  916. template <typename T>
  917. static inline void interleave_4x8_1_s(
  918. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  919. T*& outptr) {
  920. static_assert(sizeof(T) == 4, "only support size == 4");
  921. asm volatile(
  922. "ld1 {v0.4s, v1.4s}, [%[inptr0]], #32\n"
  923. "ld1 {v2.4s, v3.4s}, [%[inptr1]], #32\n"
  924. "ld1 {v4.4s, v5.4s}, [%[inptr2]], #32\n"
  925. "ld1 {v6.4s, v7.4s}, [%[inptr3]], #32\n"
  926. "st1 {v0.4s, v1.4s}, [%[outptr]], #32\n"
  927. "st1 {v2.4s, v3.4s}, [%[outptr]], #32\n"
  928. "st1 {v4.4s, v5.4s}, [%[outptr]], #32\n"
  929. "st1 {v6.4s, v7.4s}, [%[outptr]], #32\n"
  930. : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
  931. [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
  932. :
  933. : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "cc", "memory");
  934. }
// Interleave 4 rows of 4-byte elements, 12 per row, 1 repeat: loads 48 bytes
// from each row and stores the rows back-to-back (plain concatenation).
// All pointers are advanced in place.
template <typename T>
static inline void interleave_4x12_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s}, [%[inptr0]], #48\n"
            "ld1 {v4.4s, v5.4s, v6.4s}, [%[inptr1]], #48\n"
            "ld1 {v8.4s, v9.4s, v10.4s}, [%[inptr2]], #48\n"
            "ld1 {v12.4s, v13.4s, v14.4s}, [%[inptr3]], #48\n"
            "st1 {v0.4s, v1.4s, v2.4s}, [%[outptr]], #48\n"
            "st1 {v4.4s, v5.4s, v6.4s}, [%[outptr]], #48\n"
            "st1 {v8.4s, v9.4s, v10.4s}, [%[outptr]], #48\n"
            "st1 {v12.4s, v13.4s, v14.4s}, [%[outptr]], #48\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v4", "v5", "v6", "v8", "v9", "v10", "v12", "v13",
              "v14", "cc", "memory");
}
// Interleave 4 rows of bytes, 16 per row, 1 repeat: loads one 16-byte vector
// from each row and stores the rows back-to-back (plain concatenation).
// All pointers are advanced in place. (The .4s arrangement is just a 16-byte
// load/store here; lane width is irrelevant for a straight copy.)
template <typename T>
static inline void interleave_4x16_1_b(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 1, "only support size == 1");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"  // v0 = A0..A15 (16 bytes of row A)
            "ld1 {v1.4s}, [%[inptr1]], #16\n"  // v1 = B0..B15
            "ld1 {v2.4s}, [%[inptr2]], #16\n"  // v2 = C0..C15
            "ld1 {v3.4s}, [%[inptr3]], #16\n"  // v3 = D0..D15
            "st1 {v0.4s}, [%[outptr]], #16\n"
            "st1 {v1.4s}, [%[outptr]], #16\n"
            "st1 {v2.4s}, [%[outptr]], #16\n"
            "st1 {v3.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "cc", "memory");
}
// Interleave 4 rows of 4-byte elements, 16 per row, 1 repeat: loads 64 bytes
// from each row and stores the rows back-to-back (plain concatenation).
// All pointers are advanced in place.
template <typename T>
static inline void interleave_4x16_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[inptr1]], #64\n"
            "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[inptr2]], #64\n"
            "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[inptr3]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]], #64\n"
            "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[outptr]], #64\n"
            "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[outptr]], #64\n"
            "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
              "v12", "v13", "v14", "v15", "cc", "memory");
}
  994. template <typename T>
  995. static inline void interleave_4x2_4_b(
  996. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  997. T*& outptr) {
  998. static_assert(
  999. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  1000. "interleave_4x2_4_b only support uint8_t and int8_t");
  1001. interleave_4x1_4_h(
  1002. reinterpret_cast<const int16_t*&>(inptr0),
  1003. reinterpret_cast<const int16_t*&>(inptr1),
  1004. reinterpret_cast<const int16_t*&>(inptr2),
  1005. reinterpret_cast<const int16_t*&>(inptr3),
  1006. reinterpret_cast<int16_t*&>(outptr));
  1007. }
  1008. template <typename T>
  1009. static inline void interleave_4x4_4_b(
  1010. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1011. T*& outptr) {
  1012. static_assert(
  1013. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  1014. "interleave_4x4_4_b only support uint8_t and int8_t");
  1015. interleave_4x1_4_s(
  1016. reinterpret_cast<const int32_t*&>(inptr0),
  1017. reinterpret_cast<const int32_t*&>(inptr1),
  1018. reinterpret_cast<const int32_t*&>(inptr2),
  1019. reinterpret_cast<const int32_t*&>(inptr3),
  1020. reinterpret_cast<int32_t*&>(outptr));
  1021. }
// Interleave 4 rows of 4-byte elements, 4 per row, 1 repeat: loads one
// vector per row and stores the four rows back-to-back in a single 64-byte
// store. All pointers are advanced in place.
template <typename T>
static inline void interleave_4x4_1_s(
        const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
        T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_4x4_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"
            "ld1 {v1.4s}, [%[inptr1]], #16\n"
            "ld1 {v2.4s}, [%[inptr2]], #16\n"
            "ld1 {v3.4s}, [%[inptr3]], #16\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
              [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "cc", "memory");
}
// Interleave 2 rows of 4-byte elements, 4 elements at a time, 4 repeats:
// reads 16 elements from each row and writes vector k of row 0 followed by
// vector k of row 1, for k = 0..3 (A0 B0 A1 B1 A2 B2 A3 B3 in vector units).
// Input pointers are advanced; `outptr` is taken by value and written via
// stp at fixed offsets, so the caller's copy is unchanged.
template <typename T>
static inline void interleave_2x4_4_s(const T*& inptr0, const T*& inptr1, T* outptr) {
    static_assert(sizeof(T) == 4, "interleave_2x4_4_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[inptr1]], #64\n"
            "stp q0, q4, [%[outptr]]\n"
            "stp q1, q5, [%[outptr], #32]\n"
            "stp q2, q6, [%[outptr], #64]\n"
            "stp q3, q7, [%[outptr], #96]\n"
            : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
// Copy 16 4-byte elements from one row to the output (single-row case is a
// straight copy). The input pointer is advanced; `outptr` is taken by value
// and is not advanced by the store.
template <typename T>
static inline void interleave_1x4_4_s(const T*& inptr0, T* outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x4_4_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]]\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "memory");
}
  1062. template <typename T>
  1063. static inline void interleave_4x8_2_b(
  1064. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1065. T*& outptr) {
  1066. static_assert(
  1067. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  1068. "interleave_4x8_2_b only support uint8_t and int8_t");
  1069. interleave_4x1_2_d(
  1070. reinterpret_cast<const int64_t*&>(inptr0),
  1071. reinterpret_cast<const int64_t*&>(inptr1),
  1072. reinterpret_cast<const int64_t*&>(inptr2),
  1073. reinterpret_cast<const int64_t*&>(inptr3),
  1074. reinterpret_cast<int64_t*&>(outptr));
  1075. }
  1076. template <typename T>
  1077. static inline void interleave_4x8_2_h(
  1078. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1079. T*& outptr) {
  1080. static_assert(
  1081. std::is_same<T, int16_t>::value || std::is_same<T, uint16_t>::value,
  1082. "interleave_4x8_2_h only support uint16_t and int16_t");
  1083. interleave_4x2_2_d(
  1084. reinterpret_cast<const int64_t*&>(inptr0),
  1085. reinterpret_cast<const int64_t*&>(inptr1),
  1086. reinterpret_cast<const int64_t*&>(inptr2),
  1087. reinterpret_cast<const int64_t*&>(inptr3),
  1088. reinterpret_cast<int64_t*&>(outptr));
  1089. }
// Copy 16 4-byte elements from one row to the output; both pointers are
// advanced past the copied data.
template <typename T>
static inline void interleave_1x16_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x16_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
            "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]], #64\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "v3", "cc", "memory");
}
// Copy 12 4-byte elements from one row to the output; both pointers are
// advanced past the copied data.
template <typename T>
static inline void interleave_1x12_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x12_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s, v2.4s}, [%[inptr0]], #48\n"
            "st1 {v0.4s, v1.4s, v2.4s}, [%[outptr]], #48\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "v2", "cc", "memory");
}
// Copy 8 4-byte elements from one row to the output; both pointers are
// advanced past the copied data.
template <typename T>
static inline void interleave_1x8_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x8_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s, v1.4s}, [%[inptr0]], #32\n"
            "st1 {v0.4s, v1.4s}, [%[outptr]], #32\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "v1", "cc", "memory");
}
// Copy 4 4-byte elements from one row to the output; both pointers are
// advanced past the copied data.
template <typename T>
static inline void interleave_1x4_1_s(const T*& inptr0, T*& outptr) {
    static_assert(sizeof(T) == 4, "interleave_1x4_1_s only support size == 4");
    asm volatile(
            "ld1 {v0.4s}, [%[inptr0]], #16\n"
            "st1 {v0.4s}, [%[outptr]], #16\n"
            : [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
            :
            : "v0", "cc", "memory");
}
  1130. template <typename T>
  1131. static inline void interleave_helper(
  1132. const T*& inptr, T*& outptr, int unroll_k, int ksize, T val = 0) {
  1133. int k = 0;
  1134. for (; k < ksize; k++) {
  1135. *outptr++ = *inptr++;
  1136. }
  1137. for (; k < unroll_k; k++) {
  1138. *outptr++ = val;
  1139. }
  1140. }
  1141. template <typename T>
  1142. static inline void interleave_1(
  1143. const T*& inptr0, T*& outptr, int unroll_k, int ksize, T val = 0) {
  1144. for (int k = 0; k < ksize; k += unroll_k) {
  1145. int size = std::min(unroll_k, ksize - k);
  1146. interleave_helper(inptr0, outptr, unroll_k, size, val);
  1147. }
  1148. }
  1149. template <typename T>
  1150. static inline void interleave_4(
  1151. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1152. T*& outptr, int unroll_k, int ksize, T val = 0) {
  1153. for (int k = 0; k < ksize; k += unroll_k) {
  1154. int size = std::min(unroll_k, ksize - k);
  1155. interleave_helper(inptr0, outptr, unroll_k, size, val);
  1156. interleave_helper(inptr1, outptr, unroll_k, size, val);
  1157. interleave_helper(inptr2, outptr, unroll_k, size, val);
  1158. interleave_helper(inptr3, outptr, unroll_k, size, val);
  1159. }
  1160. }
  1161. template <typename T>
  1162. static inline void interleave_8(
  1163. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1164. const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
  1165. T*& outptr, int unroll_k, int ksize, T val = 0) {
  1166. for (int k = 0; k < ksize; k += unroll_k) {
  1167. int size = std::min(unroll_k, ksize - k);
  1168. interleave_helper(inptr0, outptr, unroll_k, size, val);
  1169. interleave_helper(inptr1, outptr, unroll_k, size, val);
  1170. interleave_helper(inptr2, outptr, unroll_k, size, val);
  1171. interleave_helper(inptr3, outptr, unroll_k, size, val);
  1172. interleave_helper(inptr4, outptr, unroll_k, size, val);
  1173. interleave_helper(inptr5, outptr, unroll_k, size, val);
  1174. interleave_helper(inptr6, outptr, unroll_k, size, val);
  1175. interleave_helper(inptr7, outptr, unroll_k, size, val);
  1176. }
  1177. }
  1178. template <typename T>
  1179. static inline void interleave_12(
  1180. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1181. const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
  1182. const T*& inptr8, const T*& inptr9, const T*& inptr10, const T*& inptr11,
  1183. T*& outptr, int unroll_k, int ksize) {
  1184. for (int k = 0; k < ksize; k += unroll_k) {
  1185. int size = std::min(unroll_k, ksize - k);
  1186. interleave_helper(inptr0, outptr, unroll_k, size);
  1187. interleave_helper(inptr1, outptr, unroll_k, size);
  1188. interleave_helper(inptr2, outptr, unroll_k, size);
  1189. interleave_helper(inptr3, outptr, unroll_k, size);
  1190. interleave_helper(inptr4, outptr, unroll_k, size);
  1191. interleave_helper(inptr5, outptr, unroll_k, size);
  1192. interleave_helper(inptr6, outptr, unroll_k, size);
  1193. interleave_helper(inptr7, outptr, unroll_k, size);
  1194. interleave_helper(inptr8, outptr, unroll_k, size);
  1195. interleave_helper(inptr9, outptr, unroll_k, size);
  1196. interleave_helper(inptr10, outptr, unroll_k, size);
  1197. interleave_helper(inptr11, outptr, unroll_k, size);
  1198. }
  1199. }
  1200. /* ======================== transpose pack B ======================== */
  1201. /**
  1202. * transpose_INTERLEAVE_UNROLLK_BATCH_type
  1203. *
* BATCH means process BATCH * INTERLEAVE cols at once; BATCH * sizeof(TYPE) *
* INTERLEAVE = 16 bytes (128 bits, one vector register).
  1206. *
  1207. * the elements traverse order:
* rep(j, 0, INTERLEAVE) rep(i, 0, UNROLL_K) *outptr++ = inptr[i, j]
  1209. */
//! Pack 4 rows of 24 2-byte elements: the 48 bytes of each of in0..in3 are
//! copied into consecutive 48-byte groups at out (192 bytes total). Input
//! pointers advance by 48 bytes each; the caller's out is unchanged.
template <typename T>
static inline void transpose_24x4_1_h(
const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
asm volatile(
// each row: ldp (32B) + ldr (16B), with a prefetch 192B ahead
"ldp q0, q1, [%[in0]], #32\n"
"stp q0, q1, [%[out]]\n"
"ldr q2, [%[in0]], #16\n"
ASM_PREFETCH("[%[in0], #192]")
"ldp q3, q4, [%[in1]], #32\n"
"stp q2, q3, [%[out], #32]\n"
"ldr q5, [%[in1]], #16\n"
ASM_PREFETCH("[%[in1], #192]")
"stp q4, q5, [%[out], #64]\n"
"ldp q6, q7, [%[in2]], #32\n"
"stp q6, q7, [%[out], #96]\n"
"ldr q8, [%[in2]], #16\n"
ASM_PREFETCH("[%[in2], #192]")
"ldp q9, q10, [%[in3]], #32\n"
"stp q8, q9, [%[out], #128]\n"
"ldr q11, [%[in3]], #16\n"
"stp q10, q11, [%[out], #160]\n"
ASM_PREFETCH("[%[in3], #192]")
: [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2),
[in3] "+r"(in3), [out] "+r"(out)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "memory");
}
//! Pack 4 rows of 16 2-byte elements: 32 bytes from each of in0..in3 are
//! copied into consecutive 32-byte groups at out (128 bytes total). Input
//! pointers advance by 32 bytes each.
template <typename T>
static inline void transpose_16x4_1_h(
const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
asm volatile(
"ldp q0, q1, [%[in0]], #32\n"
"stp q0, q1, [%[out]]\n"
"ldp q2, q3, [%[in1]], #32\n"
"stp q2, q3, [%[out], #32]\n"
"ldp q4, q5, [%[in2]], #32\n"
"stp q4, q5, [%[out], #64]\n"
"ldp q6, q7, [%[in3]], #32\n"
"stp q6, q7, [%[out], #96]\n"
: [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
[out] "+r"(out)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
//! Pack 4 rows of 8 2-byte elements: 16 bytes from each of in0..in3 are
//! copied into consecutive 16-byte groups at out (64 bytes total). Input
//! pointers advance by 16 bytes each.
template <typename T>
static inline void transpose_8x4_1_h(
const T*& in0, const T*& in1, const T*& in2, const T*& in3, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
asm volatile(
"ldr q0, [%[in0]], #16\n"
"str q0, [%[out]]\n"
"ldr q1, [%[in1]], #16\n"
"str q1, [%[out], #16]\n"
"ldr q2, [%[in2]], #16\n"
"str q2, [%[out], #32]\n"
"ldr q3, [%[in3]], #16\n"
"str q3, [%[out], #48]\n"
: [in0] "+r"(in0), [in1] "+r"(in1), [in2] "+r"(in2), [in3] "+r"(in3),
[out] "+r"(out)
:
: "v0", "v1", "v2", "v3", "memory");
}
//! Pack 2 rows of 24 2-byte elements: 48 bytes from each of in0, in1 are
//! copied into consecutive 48-byte groups at out (96 bytes total). Input
//! pointers advance by 48 bytes each, with prefetches 192B ahead.
template <typename T>
static inline void transpose_24x2_1_h(const T*& in0, const T*& in1, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
asm volatile(
"ldp q0, q1, [%[in0]], #32\n"
"stp q0, q1, [%[out]]\n"
"ldr q2, [%[in0]], #16\n"
ASM_PREFETCH("[%[in0], #192]")
"ldp q3, q4, [%[in1]], #32\n"
"stp q2, q3, [%[out], #32]\n"
"ldr q5, [%[in1]], #16\n"
ASM_PREFETCH("[%[in1], #192]")
"stp q4, q5, [%[out], #64]\n"
: [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "memory");
}
//! Pack 2 rows of 16 2-byte elements: 32 bytes from each of in0, in1 are
//! copied into consecutive 32-byte groups at out (64 bytes total).
template <typename T>
static inline void transpose_16x2_1_h(const T*& in0, const T*& in1, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
asm volatile(
"ldp q0, q1, [%[in0]], #32\n"
"stp q0, q1, [%[out]]\n"
"ldp q2, q3, [%[in1]], #32\n"
"stp q2, q3, [%[out], #32]\n"
: [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
:
: "v0", "v1", "v2", "v3", "memory");
}
//! Pack 2 rows of 8 2-byte elements: 16 bytes from each of in0, in1 are
//! copied into consecutive 16-byte groups at out (32 bytes total).
template <typename T>
static inline void transpose_8x2_1_h(const T*& in0, const T*& in1, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
asm volatile(
"ldr q0, [%[in0]], #16\n"
"str q0, [%[out]]\n"
"ldr q1, [%[in1]], #16\n"
"str q1, [%[out], #16]\n"
: [in0] "+r"(in0), [in1] "+r"(in1), [out] "+r"(out)
:
: "v0", "v1", "memory");
}
//! Copy one row of 24 2-byte elements (48 bytes) from in0 to out, advancing
//! in0 by 48 bytes, with a prefetch 192B ahead.
template <typename T>
static inline void transpose_24x1_1_h(const T*& in0, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
// clang-format off
asm volatile(
"ldp q0, q1, [%[in0]], #32\n"
"stp q0, q1, [%[out]] \n"
"ldr q2, [%[in0]], #16 \n"
ASM_PREFETCH("[%[in0], #192]")
"str q2, [%[out], #32] \n"
: [in0] "+r"(in0), [out] "+r"(out)
:
: "v0", "v1", "v2", "memory");
// clang-format on
}
//! Copy one row of 16 2-byte elements (32 bytes) from in0 to out, advancing
//! in0 by 32 bytes.
template <typename T>
static inline void transpose_16x1_1_h(const T*& in0, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
asm volatile(
"ldp q0, q1, [%[in0]], #32\n"
"stp q0, q1, [%[out]]\n"
: [in0] "+r"(in0), [out] "+r"(out)
:
: "v0", "v1", "memory");
}
//! Copy one row of 12 2-byte elements (24 bytes) from in0 to out, advancing
//! both pointers (8 elements then 4 elements).
template <typename T>
static inline void transpose_12x1_1_h(const T*& in0, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
// clang-format off
asm volatile(
"ld1 {v0.8h}, [%[in0]], #16\n"
"ld1 {v1.4h}, [%[in0]], #8\n"
"st1 {v0.8h}, [%[out]], #16\n"
"st1 {v1.4h}, [%[out]], #8\n"
: [in0] "+r"(in0), [out] "+r"(out)
:
: "v0", "v1", "memory");
// clang-format on
}
//! Copy one row of 8 2-byte elements (16 bytes) from in0 to out, advancing
//! in0 by 16 bytes.
template <typename T>
static inline void transpose_8x1_1_h(const T*& in0, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
asm volatile(
"ldr q0, [%[in0]], #16\n"
"str q0, [%[out]]\n"
: [in0] "+r"(in0), [out] "+r"(out)
:
: "v0", "memory");
}
//! Copy one row of 4 2-byte elements (8 bytes) from in0 to out, advancing
//! both pointers by 8 bytes.
template <typename T>
static inline void transpose_4x1_1_h(const T*& in0, T* out) {
static_assert(sizeof(T) == 2, "only support size == 2");
// clang-format off
asm volatile(
"ld1 {v0.4h}, [%[in0]], #8\n"
"st1 {v0.4h}, [%[out]], #8\n"
: [in0] "+r"(in0), [out] "+r"(out)
:
: "v0", "memory");
// clang-format on
}
//! Transpose a 4x4 tile of 4-byte elements: row r of the output receives
//! column r of the inputs (A,B,C,D). Output rows are written \p stride bytes
//! apart (default 16, i.e. contiguous). Input pointers advance by 16 bytes.
template <typename T>
static inline void transpose_4x4_1_s(
const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
T* outptr, int stride = 16) {
static_assert(sizeof(T) == 4, "transpose_4x4_1_s only support sizeof(T) == 4");
asm volatile(
"ld1 {v0.4s}, [%[inptr0]], 16\n" // A0A1A2A3
"ld1 {v1.4s}, [%[inptr1]], 16\n" // B0B1B2B3
"ld1 {v2.4s}, [%[inptr2]], 16\n" // C0C1C2C3
"ld1 {v3.4s}, [%[inptr3]], 16\n" // D0D1D2D3
"zip1 v4.4s, v0.4s, v1.4s\n" // A0B0A1B1
"zip1 v5.4s, v2.4s, v3.4s\n" // C0D0C1D1
"zip2 v6.4s, v0.4s, v1.4s\n" // A2B2A3B3
"zip2 v7.4s, v2.4s, v3.4s\n" // C2D2C3D3
"zip1 v8.2d, v4.2d, v5.2d\n" // A0B0C0D0
"zip1 v9.2d, v6.2d, v7.2d\n" // A2B2C2D2
"zip2 v10.2d, v4.2d, v5.2d\n" // A1B1C1D1
"zip2 v11.2d, v6.2d, v7.2d\n" // A3B3C3D3
"st1 {v8.4s}, [%[outptr]], %x[stride]\n"
"st1 {v10.4s}, [%[outptr]], %x[stride]\n"
"st1 {v9.4s}, [%[outptr]], %x[stride]\n"
"st1 {v11.4s}, [%[outptr]], %x[stride]\n"
: [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3), [outptr] "+r"(outptr), [stride] "+r"(stride)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"memory");
}
//! Transpose 48 contiguous 4-byte elements read as a 12x4 row-major tile into
//! a 4x12 layout at outptr: ld4 deinterleaves with stride 4, then the stores
//! gather all elements of each original column together. inptr0 advances by
//! 192 bytes; outptr is unchanged for the caller.
template <typename T>
static inline void transpose_1x12_4_s(const T*& inptr0, T* outptr) {
static_assert(sizeof(T) == 4, "transpose_1x12_4_s only support sizeof(T) == 4");
asm volatile(
"ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
"ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[inptr0]], #64\n"
"ld4 {v8.4s, v9.4s, v10.4s, v11.4s},[%[inptr0]], #64\n"
"stp q0, q4, [%[outptr]] \n"
"stp q8, q1, [%[outptr], #32] \n"
"stp q5, q9, [%[outptr], #64] \n"
"stp q2, q6, [%[outptr], #96] \n"
"stp q10, q3, [%[outptr], #128] \n"
"stp q7, q11, [%[outptr], #160] \n"
: [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"memory");
}
//! Transpose 16 contiguous 4-byte elements read as a 4x4 row-major tile:
//! ld4 deinterleaves columns into v0..v3, st1 writes them back row-by-row.
//! inptr0 advances by 64 bytes.
template <typename T>
static inline void transpose_1x4_4_s(const T*& inptr0, T* outptr) {
static_assert(sizeof(T) == 4, "transpose_1x4_4_s only support sizeof(T) == 4");
asm volatile(
"ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[inptr0]], #64\n"
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[outptr]]\n"
: [inptr0] "+r"(inptr0), [outptr] "+r"(outptr)
:
: "v0", "v1", "v2", "v3", "memory");
}
//! Transpose an 8x4 tile of 4-byte elements (rows A..H, 4 columns each) into
//! 4 output rows of 8 (column k of all inputs becomes output row k).
//! Input pointers advance by 16 bytes each; 128 bytes are written at outptr.
template <typename T>
static inline void transpose_8x4_1_s(
const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
T* outptr) {
static_assert(sizeof(T) == 4, "transpose_8x4_1_s only support sizeof(T) == 4");
asm volatile(
"ld1 {v0.4s}, [%[inptr0]], 16\n" // A0A1A2A3
"ld1 {v1.4s}, [%[inptr1]], 16\n" // B0B1B2B3
"ld1 {v2.4s}, [%[inptr2]], 16\n" // C0C1C2C3
"ld1 {v3.4s}, [%[inptr3]], 16\n" // D0D1D2D3
"ld1 {v4.4s}, [%[inptr4]], 16\n" // E0E1E2E3
"ld1 {v5.4s}, [%[inptr5]], 16\n" // F0F1F2F3
"ld1 {v6.4s}, [%[inptr6]], 16\n" // G0G1G2G3
"ld1 {v7.4s}, [%[inptr7]], 16\n" // H0H1H2H3
"zip1 v8.4s, v0.4s, v1.4s\n" // A0B0A1B1
"zip2 v9.4s, v0.4s, v1.4s\n" // A2B2A3B3
"zip1 v10.4s, v2.4s, v3.4s\n" // C0D0C1D1
"zip2 v11.4s, v2.4s, v3.4s\n" // C2D2C3D3
"zip1 v12.4s, v4.4s, v5.4s\n" // E0F0E1F1
"zip2 v13.4s, v4.4s, v5.4s\n" // E2F2E3F3
"zip1 v14.4s, v6.4s, v7.4s\n" // G0H0G1H1
"zip2 v15.4s, v6.4s, v7.4s\n" // G2H2G3H3
"zip1 v0.2d, v8.2d, v10.2d\n" // A0B0C0D0
"zip2 v2.2d, v8.2d, v10.2d\n" // A1B1C1D1
"zip1 v4.2d, v9.2d, v11.2d\n" // A2B2C2D2
"zip2 v6.2d, v9.2d, v11.2d\n" // A3B3C3D3
"zip1 v1.2d, v12.2d, v14.2d\n" // E0F0G0H0
"zip2 v3.2d, v12.2d, v14.2d\n" // E1F1G1H1
"zip1 v5.2d, v13.2d, v15.2d\n" // E2F2G2H2
"zip2 v7.2d, v13.2d, v15.2d\n" // E3F3G3H3
"st1 {v0.4s,v1.4s,v2.4s,v3.4s}, [%[outptr]], #64\n"
"st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [%[outptr]], #64\n"
: [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
[inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"v12", "v13", "v14", "v15", "memory");
}
//! Transpose a 12x4 tile of 4-byte elements (rows A..L, 4 columns each) into
//! 4 output rows of 12 (column k of all inputs becomes output row k).
//! Input pointers advance by 16 bytes each; 192 bytes are written at outptr.
template <typename T>
static inline void transpose_12x4_1_s(
const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
const T*& inptr8, const T*& inptr9, const T*& inptr10, const T*& inptr11,
T* outptr) {
static_assert(sizeof(T) == 4, "transpose_12x4_1_s only support sizeof(T) == 4");
asm volatile(
"ld1 {v0.4s}, [%[inptr0]], 16\n" // A0A1A2A3
"ld1 {v1.4s}, [%[inptr1]], 16\n" // B0B1B2B3
"ld1 {v2.4s}, [%[inptr2]], 16\n" // C0C1C2C3
"ld1 {v3.4s}, [%[inptr3]], 16\n" // D0D1D2D3
"ld1 {v4.4s}, [%[inptr4]], 16\n" // E0E1E2E3
"ld1 {v5.4s}, [%[inptr5]], 16\n" // F0F1F2F3
"ld1 {v6.4s}, [%[inptr6]], 16\n" // G0G1G2G3
"ld1 {v7.4s}, [%[inptr7]], 16\n" // H0H1H2H3
"ld1 {v16.4s}, [%[inptr8]], 16\n" // I0I1I2I3
"ld1 {v17.4s}, [%[inptr9]], 16\n" // J0J1J2J3
"ld1 {v18.4s}, [%[inptr10]], 16\n" // K0K1K2K3
"ld1 {v19.4s}, [%[inptr11]], 16\n" // L0L1L2L3
"zip1 v8.4s, v0.4s, v1.4s\n" // A0B0A1B1
"zip2 v9.4s, v0.4s, v1.4s\n" // A2B2A3B3
"zip1 v10.4s, v2.4s, v3.4s\n" // C0D0C1D1
"zip2 v11.4s, v2.4s, v3.4s\n" // C2D2C3D3
"zip1 v12.4s, v4.4s, v5.4s\n" // E0F0E1F1
"zip2 v13.4s, v4.4s, v5.4s\n" // E2F2E3F3
"zip1 v14.4s, v6.4s, v7.4s\n" // G0H0G1H1
"zip2 v15.4s, v6.4s, v7.4s\n" // G2H2G3H3
"zip1 v20.4s, v16.4s, v17.4s\n" // I0J0I1J1
"zip2 v21.4s, v16.4s, v17.4s\n" // I2J2I3J3
"zip1 v22.4s, v18.4s, v19.4s\n" // K0L0K1L1
"zip2 v23.4s, v18.4s, v19.4s\n" // K2L2K3L3
"zip1 v0.2d, v8.2d, v10.2d\n" // A0B0C0D0
"zip2 v3.2d, v8.2d, v10.2d\n" // A1B1C1D1
"zip1 v6.2d, v9.2d, v11.2d\n" // A2B2C2D2
"zip2 v24.2d, v9.2d, v11.2d\n" // A3B3C3D3
"zip1 v1.2d, v12.2d, v14.2d\n" // E0F0G0H0
"zip2 v4.2d, v12.2d, v14.2d\n" // E1F1G1H1
"zip1 v7.2d, v13.2d, v15.2d\n" // E2F2G2H2
"zip2 v25.2d, v13.2d, v15.2d\n" // E3F3G3H3
"zip1 v2.2d, v20.2d, v22.2d\n" // I0J0K0L0
"zip2 v5.2d, v20.2d, v22.2d\n" // I1J1K1L1
"zip1 v8.2d, v21.2d, v23.2d\n" // I2J2K2L2
"zip2 v26.2d, v21.2d, v23.2d\n" // I3J3K3L3
"st1 {v0.4s,v1.4s,v2.4s}, [%[outptr]], #48\n"
"st1 {v3.4s,v4.4s,v5.4s}, [%[outptr]], #48\n"
"st1 {v6.4s,v7.4s,v8.4s}, [%[outptr]], #48\n"
"st1 {v24.4s,v25.4s,v26.4s}, [%[outptr]], #48\n"
: [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
[inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8),
[inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), [inptr11] "+r"(inptr11),
[outptr] "+r"(outptr)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
"v22", "v23", "v24", "v25", "v26", "memory");
}
  1531. template <typename T>
  1532. static inline void transpose_12x4_1_b(
  1533. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1534. T* outptr) {
  1535. static_assert(
  1536. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  1537. "transpose_12x4_1_b only support uint8_t and int8_t");
  1538. asm volatile(
  1539. "ldr q0, [%[inptr0]], #12\n" // A1A2A3A4A5A6A7A8A9A10A11A12A13A14A15A16
  1540. "ldr q1, [%[inptr1]], #12\n" // B1B2B3B4B5B6B7B8B9B10B11B12B13B14B15B16
  1541. "ldr q2, [%[inptr2]], #12\n" // C1C2C3C4C5C6C7C8C9C10C11C12C13C14C15C16
  1542. //! \warning the last inptr3 may less than 16bytes, so we should
  1543. //! split read it
  1544. "ldr d3, [%[inptr3]], #8\n" // D1D2D3D4D5D6D7D8D9D10D11D12D13D14D15D16
  1545. "ldr w1, [%[inptr3]], #4\n"
  1546. "ins v3.s[2], w1\n"
  1547. "trn1 v4.16b, v0.16b, v1.16b\n" // v4: A1B1A3B3....
  1548. "trn2 v5.16b, v0.16b, v1.16b\n" // v5: A2B2A4B4....
  1549. "trn1 v6.16b, v2.16b, v3.16b\n" // v6: C1D1C3D3....
  1550. "trn2 v7.16b, v2.16b, v3.16b\n" // v7: C2D2C4D4....
  1551. "trn1 v8.8h, v4.8h, v6.8h\n" // v8: A1B1C1D1A5B5C5D5...
  1552. "trn2 v9.8h, v4.8h, v6.8h\n" // v9: A3B3C3D3A7B7C7D7...
  1553. "trn1 v10.8h, v5.8h, v7.8h\n" // v10: A2B2C2D2A6B6C6D6...
  1554. "trn2 v11.8h, v5.8h, v7.8h\n" // v11: A4B4C4D4A8B8C8D8...
  1555. //! ABCD=E then
  1556. //! v8: E1E5E9E13 v10: E2E6E10E14 v9: E3E7E11E15 v11:
  1557. //! E4E8E12E16
  1558. "zip1 v12.4s, v8.4s, v10.4s\n" // v12: E1E2E5E6
  1559. "zip2 v13.4s, v8.4s, v10.4s\n" // v13: E9E10E13E14
  1560. "zip1 v14.4s, v9.4s, v11.4s\n" // v14: E3E4E7E8
  1561. "zip2 v15.4s, v9.4s, v11.4s\n" // v15: E11E12E15E16
  1562. "zip1 v17.2d, v12.2d, v14.2d\n" // v17: E1E2E3E4
  1563. "zip2 v18.2d, v12.2d, v14.2d\n" // v18: E5E6E7E8
  1564. "zip1 v19.2d, v13.2d, v15.2d\n" // v19: E8E10E11E12
  1565. "zip2 v20.2d, v13.2d, v15.2d\n" // v19: E13E14E15E16
  1566. "stp q17, q18, [%[outptr]], #32\n"
  1567. "str q19, [%[outptr]], #16\n"
  1568. : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
  1569. [inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
  1570. :
  1571. : "w1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
  1572. "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "memory");
  1573. }
//! Transpose a 4x8 byte tile: 8 bytes from each of inptr0..inptr3 become 8
//! groups of 4 at outptr (A1B1C1D1 A2B2C2D2 ... A8B8C8D8, 32 bytes total).
//! Input pointers advance by 8 bytes; outptr advances by 32.
template <typename T>
static inline void transpose_8x4_1_b(
const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
T* outptr) {
static_assert(
std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
"transpose_8x4_1_b only support uint8_t and int8_t");
asm volatile(
"ld1 {v0.d}[0], [%[inptr0]], #8\n"  // A1A2A3A4A5A6A7A8
"ld1 {v1.d}[0], [%[inptr1]], #8\n"  // B1B2B3B4B5B6B7B8
"ld1 {v0.d}[1], [%[inptr2]], #8\n"  // C1C2C3C4C5C6C7C8
"ld1 {v1.d}[1], [%[inptr3]], #8\n"  // D1D2D3D4D5D6D7D8
"zip1 v2.16b, v0.16b, v1.16b\n"  // A1B1A2B2A3B3A4B4A5B5A6B6A7B7A8B8
"zip2 v3.16b, v0.16b, v1.16b\n"  // C1D1C2D2C3D3C4D4C5D5C6D6C7D7C8D8
"zip1 v4.8h, v2.8h, v3.8h\n"  // A1B1C1D1A2B2C2D2A3B3C3D3A4B4C4D4
"zip2 v5.8h, v2.8h, v3.8h\n"  // A5B5C5D5A6B6C6D6A7B7C7D7A8B8C8D8
"st1 {v4.2d}, [%[outptr]], #16\n"
"st1 {v5.2d}, [%[outptr]], #16\n"
: [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3), [outptr] "+r"(outptr)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "memory");
}
//! Transpose an 8x4 byte tile (4 bytes from each of 8 rows) and split every
//! byte into its two signed 4-bit nibbles: shl #4 + sshr #4 sign-extends the
//! low nibble, sshr #4 extracts the high nibble, and the zips lay low/high
//! halves out pairwise. 64 bytes are written and outptr advances by 64.
//! NOTE(review): the static_assert message says "transpose_8x4_1_b" — looks
//! like a copy/paste from the sibling function.
//! NOTE(review): shuffle_idx is a mutable function-local static bound with a
//! read-write "+w" constraint even though the asm never changes it — confirm
//! this is intentional (an input "w" constraint on a const table would do).
template <typename T>
static inline void transpose_4x8_1_b_with_shift(
const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
T*& outptr) {
static int8x16_t shuffle_idx = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
static_assert(
std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
"transpose_8x4_1_b only support uint8_t and int8_t");
asm volatile(
"ld1 {v0.s}[0], [%[inptr0]], #4\n"  // A1A2A3A4
"ld1 {v0.s}[1], [%[inptr1]], #4\n"  // B1B2B3B4
"ld1 {v0.s}[2], [%[inptr2]], #4\n"  // C1C2C3C4
"ld1 {v0.s}[3], [%[inptr3]], #4\n"  // D1D2D3D4
"ld1 {v1.s}[0], [%[inptr4]], #4\n"  // E1E2E3E4
"ld1 {v1.s}[1], [%[inptr5]], #4\n"  // F1F2F3F4
"ld1 {v1.s}[2], [%[inptr6]], #4\n"  // G1G2G3G4
"ld1 {v1.s}[3], [%[inptr7]], #4\n"  // H1H2H3H4
"tbl v2.16b, {v0.16b}, %[shuffle_idx].16b \n"  // A1B1C1D1A2B2C2D2A3B3C3D3A4B4C4D4
"tbl v3.16b, {v1.16b}, %[shuffle_idx].16b \n"  // E1F1G1H1E2F2G2H2E3F3G3H3E4F4G4H4
"zip1 v4.4s, v2.4s, v3.4s\n"  // A1B1C1D1E1F1G1H1 A2B2C2D2E2F2G2H2
"zip2 v5.4s, v2.4s, v3.4s\n"  // A3B3C3D3E3F3G3H3 A4B4C4D4E4F4G4H4
"shl v6.16b, v4.16b, #4\n"
"sshr v7.16b, v4.16b, #4\n"   // hig
"sshr v8.16b, v6.16b, #4\n"   // low
"shl v9.16b, v5.16b, #4\n"
"sshr v10.16b, v5.16b, #4\n"  // hig
"sshr v11.16b, v9.16b, #4\n"  // low
"zip1 v0.2d,v8.2d,v7.2d\n"
"zip2 v1.2d,v8.2d,v7.2d\n"
"zip1 v2.2d,v11.2d,v10.2d\n"
"zip2 v3.2d,v11.2d,v10.2d\n"
"st1 {v0.2d-v3.2d},[%[outptr]],#64\n"
: [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
[inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7),
[shuffle_idx] "+w"(shuffle_idx), [outptr] "+r"(outptr)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"memory");
}
//! Transpose an 8x8 byte tile (rows A..H): output row k holds column k of all
//! eight inputs. Only the low 8 bytes of each source register are loaded, so
//! every zip1/zip2 pair used below draws from defined lanes. Input pointers
//! advance by 8 bytes each; 64 bytes are written at outptr.
template <typename T>
static inline void transpose_8x8_1_b(
const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
T* outptr) {
static_assert(
std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
"transpose_8x8_1_b only support uint8_t and int8_t");
asm volatile(
"ld1 {v0.8b}, [%[inptr0]], #8\n"  // A1A2A3A4A5A6A7A8
"ld1 {v1.8b}, [%[inptr1]], #8\n"  // B1B2B3B4B5B6B7B8
"ld1 {v2.8b}, [%[inptr2]], #8\n"  // C1C2C3C4C5C6C7C8
"ld1 {v3.8b}, [%[inptr3]], #8\n"  // D1D2D3D4D5D6D7D8
"ld1 {v4.8b}, [%[inptr4]], #8\n"  // E1E2E3E4E5E6E7E8
"ld1 {v5.8b}, [%[inptr5]], #8\n"  // F1F2F3F4F5F6F7F8
"ld1 {v6.8b}, [%[inptr6]], #8\n"  // G1G2G3G4G5G6G7G8
"ld1 {v7.8b}, [%[inptr7]], #8\n"  // H1H2H3H4H5H6H7H8
"zip1 v8.16b, v0.16b, v1.16b\n"   // A1B1A2B2A3B3A4B4
// A5B5A6B6A7B7A8B8
"zip1 v9.16b, v2.16b, v3.16b\n"   // C1D1C2D2C3D3C4D4
// C5D5C6D6C7D7C8D8
"zip1 v10.16b, v4.16b, v5.16b\n"  // E1F1E2F2E3F3E4F4
// E5F5E6F6E7F7E8F8
"zip1 v11.16b, v6.16b, v7.16b\n"  // G1H1G2H2G3H3G4H4
// G5H5G6H6G7H7G8H8
"zip1 v12.8h, v8.8h, v9.8h\n"     // A1B1C1D1A2B2C2D2
// A3B3C3D3A4B4C4D4
"zip1 v13.8h, v10.8h, v11.8h\n"   // E1F1G1H1E2F2G2H2
// E3F3G3H3E4F4G4H4
"zip2 v14.8h, v8.8h, v9.8h\n"     // A5B5C5D5A6B6C6D6
// A7B7C7D7A8B8C8D8
"zip2 v15.8h, v10.8h, v11.8h\n"   // E5F5G5H5E6F6G6H6
// E7F7G7H7E8F8G8H8
"zip1 v16.4s, v12.4s, v13.4s\n"   // A1B1C1D1E1F1G1H1
// A2B2C2D2E2F2G2H2
"zip1 v18.4s, v14.4s, v15.4s\n"   // A5B5C5D5E5F5G5H5
// A6B6C6D6E6F6G6H6
"zip2 v17.4s, v12.4s, v13.4s\n"   // A3B3C3D3E3F3G3H3
// A4B4C4D4E4F4G4H4
"zip2 v19.4s, v14.4s, v15.4s\n"   // A7B7C7D7E7F7G7H7
// A8B8C8D8E8F8G8H8
"st1 {v16.16b}, [%[outptr]], #16\n"  // A1B1C1D1E1F1G1H1
// A2B2C2D2E2F2G2H2
"st1 {v17.16b}, [%[outptr]], #16\n"  // A3B3C3D3E3F3G3H3
// A4B4C4D4E4F4G4H4
"st1 {v18.16b}, [%[outptr]], #16\n"  // A5B5C5D5E5F5G5H5
// A6B6C6D6E6F6G6H6
"st1 {v19.16b}, [%[outptr]], #16\n"  // A7B7C7D7E7F7G7H7
// A8B8C8D8E8F8G8H8
: [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
[inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
"v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "memory");
}
//! Transpose 4 bytes from each of 8 rows into groups of 8 (column k of all
//! eight inputs), writing 8 bytes (str d) but advancing outptr by 16 each
//! time — presumably the caller invokes this twice with outptr offset by 8 to
//! interleave two 8-row halves into 16-wide groups; TODO confirm with caller.
//! NOTE(review): shuffle_idx is a mutable static bound "+w" although the asm
//! never writes it — confirm a plain input constraint wasn't intended.
template <typename T>
static inline void transpose_4x16_1_b_helper(
const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
T* outptr) {
static_assert(sizeof(T) == 1, "only support size == 1");
static int8x16_t shuffle_idx = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
asm volatile(
"ld1 {v0.s}[0], [%[inptr0]], #4\n"
"ld1 {v0.s}[1], [%[inptr1]], #4\n"
"ld1 {v0.s}[2], [%[inptr2]], #4\n"
"ld1 {v0.s}[3], [%[inptr3]], #4\n"
"ld1 {v1.s}[0], [%[inptr4]], #4\n"
"ld1 {v1.s}[1], [%[inptr5]], #4\n"
"ld1 {v1.s}[2], [%[inptr6]], #4\n"
"ld1 {v1.s}[3], [%[inptr7]], #4\n"
"tbl v2.16b, {v0.16b}, %[shuffle_idx].16b\n"
"tbl v3.16b, {v1.16b}, %[shuffle_idx].16b\n"
"zip1 v4.4s, v2.4s, v3.4s\n"
"zip2 v5.4s, v2.4s, v3.4s\n"
"dup v6.2d, v4.d[1]\n"
"dup v7.2d, v5.d[1]\n"
"str d4, [%[outptr]], #16\n"
"str d6, [%[outptr]], #16\n"
"str d5, [%[outptr]], #16\n"
"str d7, [%[outptr]], #16\n"
: [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
[inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
[inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr),
[shuffle_idx] "+w"(shuffle_idx)
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
}
  1729. template <typename T>
  1730. static inline void transpose_4(
  1731. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1732. T* outptr, int interleave, int size, T val = 0) {
  1733. megdnn_assert(size <= interleave);
  1734. int i = 0;
  1735. for (; i < size; i++) {
  1736. *outptr++ = *inptr0++;
  1737. *outptr++ = *inptr1++;
  1738. *outptr++ = *inptr2++;
  1739. *outptr++ = *inptr3++;
  1740. }
  1741. for (; i < interleave; i++) {
  1742. *outptr++ = val;
  1743. *outptr++ = val;
  1744. *outptr++ = val;
  1745. *outptr++ = val;
  1746. }
  1747. }
  1748. template <typename T>
  1749. static inline void transpose_8(
  1750. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1751. const T*& inptr4, const T*& inptr5, const T*& inptr6, const T*& inptr7,
  1752. T* outptr, int interleave, int size, T val = 0) {
  1753. megdnn_assert(size <= interleave);
  1754. int i = 0;
  1755. for (; i < size; i++) {
  1756. *outptr++ = *inptr0++;
  1757. *outptr++ = *inptr1++;
  1758. *outptr++ = *inptr2++;
  1759. *outptr++ = *inptr3++;
  1760. *outptr++ = *inptr4++;
  1761. *outptr++ = *inptr5++;
  1762. *outptr++ = *inptr6++;
  1763. *outptr++ = *inptr7++;
  1764. }
  1765. for (; i < interleave; i++) {
  1766. *outptr++ = val;
  1767. *outptr++ = val;
  1768. *outptr++ = val;
  1769. *outptr++ = val;
  1770. *outptr++ = val;
  1771. *outptr++ = val;
  1772. *outptr++ = val;
  1773. *outptr++ = val;
  1774. }
  1775. }
  1776. /***************************** Transpose then interleave ********************/
  1777. //! pack form {1, 4(icb), 4(ic), 4(oc)} to {1, 1, 4(oc), 16(ic)}
  1778. template <typename T>
  1779. static inline void transpose_interleave_4x4_4_b(
  1780. const T*& inptr0, const T*& inptr1, const T*& inptr2, const T*& inptr3,
  1781. T* outptr, int stride = 64) {
  1782. static_assert(
  1783. sizeof(T) == 1, "transpose_interleave_4x4_4_b only support sizeof(T) == 1");
  1784. asm volatile(
  1785. "ld4 {v0.16b, v1.16b, v2.16b, v3.16b},[%[inptr0]], 64\n"
  1786. "ld4 {v4.16b, v5.16b, v6.16b, v7.16b},[%[inptr1]], 64\n"
  1787. "ld4 {v8.16b, v9.16b, v10.16b, v11.16b},[%[inptr2]], 64\n"
  1788. "ld4 {v12.16b, v13.16b, v14.16b, v15.16b},[%[inptr3]], 64\n"
  1789. "st1 {v0.16b, v1.16b, v2.16b, v3.16b},[%[outptr]], %x[stride]\n"
  1790. "st1 {v4.16b, v5.16b, v6.16b, v7.16b},[%[outptr]], %x[stride]\n"
  1791. "st1 {v8.16b, v9.16b, v10.16b, v11.16b},[%[outptr]], %x[stride]\n"
  1792. "st1 {v12.16b, v13.16b, v14.16b, v15.16b},[%[outptr]], %x[stride]\n"
  1793. : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
  1794. [inptr3] "+r"(inptr3), [outptr] "+r"(outptr), [stride] "+r"(stride)
  1795. :
  1796. : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
  1797. "v12", "v14", "v15", "memory");
  1798. }
//! Pack one {4(ic), 4(oc)}-interleaved 64-byte group into {4(oc), 16(ic)}
//! order: ld4 deinterleaves by stride 4, st1 writes the result and advances
//! outptr by `stride` (default 64). inptr0 advances by 64 bytes.
//! NOTE(review): "v4" is listed as clobbered but never written — harmless,
//! though it mildly constrains register allocation.
template <typename T>
static inline void transpose_interleave_1x4_4_b(
const T*& inptr0, T* outptr, int stride = 64) {
static_assert(
sizeof(T) == 1, "transpose_interleave_1x4_4_b only support sizeof(T) == 1");
asm volatile(
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b},[%[inptr0]], 64\n"
"st1 {v0.16b, v1.16b, v2.16b, v3.16b},[%[outptr]], %x[stride]\n"
: [inptr0] "+r"(inptr0), [outptr] "+r"(outptr), [stride] "+r"(stride)
:
: "v0", "v1", "v2", "v3", "v4", "memory");
}
  1811. static inline void interleave_4x4_16x4_s8_s16(
  1812. const int8_t* inptr0, const int8_t* inptr1, const int8_t* inptr2,
  1813. const int8_t* inptr3, int16_t* outptr) {
  1814. int8x16_t row0 = vld1q_s8(inptr0);
  1815. int16x8_t row0_01 = vmovl_low_s8(row0);
  1816. int16x8_t row0_23 = vmovl_high_s8(row0);
  1817. int16x4_t row0_0 = vget_low_s16(row0_01);
  1818. int16x4_t row0_1 = vget_high_s16(row0_01);
  1819. int16x4_t row0_2 = vget_low_s16(row0_23);
  1820. int16x4_t row0_3 = vget_high_s16(row0_23);
  1821. int8x16_t row1 = vld1q_s8(inptr1);
  1822. int16x8_t row1_01 = vmovl_low_s8(row1);
  1823. int16x8_t row1_23 = vmovl_high_s8(row1);
  1824. int16x4_t row1_0 = vget_low_s16(row1_01);
  1825. int16x4_t row1_1 = vget_high_s16(row1_01);
  1826. int16x4_t row1_2 = vget_low_s16(row1_23);
  1827. int16x4_t row1_3 = vget_high_s16(row1_23);
  1828. int8x16_t row2 = vld1q_s8(inptr2);
  1829. int16x8_t row2_01 = vmovl_low_s8(row2);
  1830. int16x8_t row2_23 = vmovl_high_s8(row2);
  1831. int16x4_t row2_0 = vget_low_s16(row2_01);
  1832. int16x4_t row2_1 = vget_high_s16(row2_01);
  1833. int16x4_t row2_2 = vget_low_s16(row2_23);
  1834. int16x4_t row2_3 = vget_high_s16(row2_23);
  1835. int8x16_t row3 = vld1q_s8(inptr3);
  1836. int16x8_t row3_01 = vmovl_low_s8(row3);
  1837. int16x8_t row3_23 = vmovl_high_s8(row3);
  1838. int16x4_t row3_0 = vget_low_s16(row3_01);
  1839. int16x4_t row3_1 = vget_high_s16(row3_01);
  1840. int16x4_t row3_2 = vget_low_s16(row3_23);
  1841. int16x4_t row3_3 = vget_high_s16(row3_23);
  1842. vst1_s16(outptr, row0_0);
  1843. vst1_s16(outptr + 1 * 4, row1_0);
  1844. vst1_s16(outptr + 2 * 4, row2_0);
  1845. vst1_s16(outptr + 3 * 4, row3_0);
  1846. vst1_s16(outptr + 4 * 4, row0_1);
  1847. vst1_s16(outptr + 5 * 4, row1_1);
  1848. vst1_s16(outptr + 6 * 4, row2_1);
  1849. vst1_s16(outptr + 7 * 4, row3_1);
  1850. vst1_s16(outptr + 8 * 4, row0_2);
  1851. vst1_s16(outptr + 9 * 4, row1_2);
  1852. vst1_s16(outptr + 10 * 4, row2_2);
  1853. vst1_s16(outptr + 11 * 4, row3_2);
  1854. vst1_s16(outptr + 12 * 4, row0_3);
  1855. vst1_s16(outptr + 13 * 4, row1_3);
  1856. vst1_s16(outptr + 14 * 4, row2_3);
  1857. vst1_s16(outptr + 15 * 4, row3_3);
  1858. };
  1859. static inline void interleave_4x4_8x4_s8_s16(
  1860. const int8_t* inptr0, const int8_t* inptr1, int16_t* outptr) {
  1861. int8x16_t row0 = vld1q_s8(inptr0);
  1862. int16x8_t row0_01 = vmovl_low_s8(row0);
  1863. int16x8_t row0_23 = vmovl_high_s8(row0);
  1864. int16x4_t row0_0 = vget_low_s16(row0_01);
  1865. int16x4_t row0_1 = vget_high_s16(row0_01);
  1866. int16x4_t row0_2 = vget_low_s16(row0_23);
  1867. int16x4_t row0_3 = vget_high_s16(row0_23);
  1868. int8x16_t row1 = vld1q_s8(inptr1);
  1869. int16x8_t row1_01 = vmovl_low_s8(row1);
  1870. int16x8_t row1_23 = vmovl_high_s8(row1);
  1871. int16x4_t row1_0 = vget_low_s16(row1_01);
  1872. int16x4_t row1_1 = vget_high_s16(row1_01);
  1873. int16x4_t row1_2 = vget_low_s16(row1_23);
  1874. int16x4_t row1_3 = vget_high_s16(row1_23);
  1875. vst1_s16(outptr, row0_0);
  1876. vst1_s16(outptr + 1 * 4, row1_0);
  1877. vst1_s16(outptr + 2 * 4, row0_1);
  1878. vst1_s16(outptr + 3 * 4, row1_1);
  1879. vst1_s16(outptr + 4 * 4, row0_2);
  1880. vst1_s16(outptr + 5 * 4, row1_2);
  1881. vst1_s16(outptr + 6 * 4, row0_3);
  1882. vst1_s16(outptr + 7 * 4, row1_3);
  1883. };
  1884. static inline void memcpy_s8_s16(const int8_t* inptr, int16_t* outptr, int count) {
  1885. for (; count >= 32; count -= 32) {
  1886. int8x8_t in0 = vld1_s8(inptr);
  1887. int8x8_t in1 = vld1_s8(inptr + 1 * 8);
  1888. int8x8_t in2 = vld1_s8(inptr + 2 * 8);
  1889. int8x8_t in3 = vld1_s8(inptr + 3 * 8);
  1890. vst1q_s16(outptr, vmovl_s8(in0));
  1891. vst1q_s16(outptr + 1 * 8, vmovl_s8(in1));
  1892. vst1q_s16(outptr + 2 * 8, vmovl_s8(in2));
  1893. vst1q_s16(outptr + 3 * 8, vmovl_s8(in3));
  1894. inptr += 32;
  1895. outptr += 32;
  1896. }
  1897. for (; count >= 8; count -= 8) {
  1898. int8x8_t in0 = vld1_s8(inptr);
  1899. vst1q_s16(outptr, vmovl_s8(in0));
  1900. inptr += 8;
  1901. outptr += 8;
  1902. }
  1903. for (; count > 0; --count) {
  1904. *outptr++ = (int16_t)(*inptr++);
  1905. }
  1906. }
  1907. static inline void transpos_12x4_s8(const int8_t* inptr0, int8_t* outptr) {
  1908. static const uint8_t src_idx_buffer[16] = {0, 4, 8, 12, 1, 5, 9, 13,
  1909. 2, 6, 10, 14, 3, 7, 11, 15};
  1910. static const uint8x16_t vtbl = vld1q_u8(&src_idx_buffer[0]);
  1911. int8x8x4_t input = vld4_s8(inptr0);
  1912. int8x16_t input2 = vqtbl1q_s8(vld1q_s8(inptr0 + 4 * 8), vtbl);
  1913. vst1_s8(outptr, input.val[0]);
  1914. vst1q_lane_s32(
  1915. reinterpret_cast<int32_t*>(outptr + 8), vreinterpretq_s32_s8(input2), 0);
  1916. vst1_s8(outptr + 1 * 12, input.val[1]);
  1917. vst1q_lane_s32(
  1918. reinterpret_cast<int32_t*>(outptr + 1 * 12 + 8),
  1919. vreinterpretq_s32_s8(input2), 1);
  1920. vst1_s8(outptr + 2 * 12, input.val[2]);
  1921. vst1q_lane_s32(
  1922. reinterpret_cast<int32_t*>(outptr + 2 * 12 + 8),
  1923. vreinterpretq_s32_s8(input2), 2);
  1924. vst1_s8(outptr + 3 * 12, input.val[3]);
  1925. vst1q_lane_s32(
  1926. reinterpret_cast<int32_t*>(outptr + 3 * 12 + 8),
  1927. vreinterpretq_s32_s8(input2), 3);
  1928. }
  1929. template <typename T>
  1930. static inline void interleave_8x8_mk4_b(
  1931. const T*& inptr0, const T*& inptr1, T*& outptr) {
  1932. static_assert(
  1933. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  1934. "transpose_8x4_1_b only support uint8_t and int8_t");
  1935. asm volatile(
  1936. "ld1 {v0.4s}, [%[inptr0]], #16\n"
  1937. "ld1 {v1.4s}, [%[inptr1]], #16\n"
  1938. "ld1 {v2.4s}, [%[inptr0]], #16\n"
  1939. "ld1 {v3.4s}, [%[inptr1]], #16\n"
  1940. "zip1 v4.4s, v0.4s, v1.4s \n"
  1941. "zip2 v5.4s, v0.4s, v1.4s \n"
  1942. "zip1 v6.4s, v2.4s, v3.4s\n"
  1943. "zip2 v7.4s, v2.4s, v3.4s\n"
  1944. "st1 {v4.4s},[%[outptr]],#16\n"
  1945. "st1 {v5.4s},[%[outptr]],#16\n"
  1946. "st1 {v6.4s},[%[outptr]],#16\n"
  1947. "st1 {v7.4s},[%[outptr]],#16\n"
  1948. : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
  1949. :
  1950. : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
  1951. }
  1952. template <typename T>
  1953. static inline void transpose_8x8_mk4_b(const T*& inptr0, const T*& inptr1, T* outptr) {
  1954. static_assert(
  1955. std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value,
  1956. "transpose_8x4_1_b only support uint8_t and int8_t");
  1957. asm volatile(
  1958. "ld4 {v0.8b-v3.8b}, [%[inptr0]], #32\n"
  1959. "ld4 {v4.8b-v7.8b}, [%[inptr1]], #32\n"
  1960. "st1 {v0.2s},[%[outptr]],#8\n"
  1961. "st1 {v1.2s},[%[outptr]],#8\n"
  1962. "st1 {v2.2s},[%[outptr]],#8\n"
  1963. "st1 {v3.2s},[%[outptr]],#8\n"
  1964. "st1 {v4.2s},[%[outptr]],#8\n"
  1965. "st1 {v5.2s},[%[outptr]],#8\n"
  1966. "st1 {v6.2s},[%[outptr]],#8\n"
  1967. "st1 {v7.2s},[%[outptr]],#8\n"
  1968. : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [outptr] "+r"(outptr)
  1969. :
  1970. : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory");
  1971. }
  1972. } // namespace aarch64
  1973. } // namespace megdnn
  1974. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台