
opr_impl.cpp

/**
 * \file dnn/src/aarch64/relayout/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "src/common/relayout_helper.h"
#include "src/common/utils.h"
#include "src/aarch64/handle.h"
#include "src/aarch64/relayout/opr_impl.h"
#include "src/arm_common/simd_macro/marm_neon.h"

using namespace megdnn;
using namespace relayout;

namespace {
struct TransposeByte {
    uint8_t v;
};
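
// Transpose a 16x16 block of bytes entirely in NEON registers: 16 rows are
// loaded with post-indexed ld1 (advancing src by src_step per row), the rows
// are interleaved pairwise with trn1/trn2 at byte, halfword, word and
// doubleword granularity, and the 16 transposed rows are stored back,
// advancing dst by dst_step per row.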
void trans_16x16_u8(
        const void* src, void* dst, const size_t src_step, const size_t dst_step) {
    asm volatile(
            "\n"
            "ld1 {v0.16b}, [%[src]], %[src_step] \n"
            "ld1 {v1.16b}, [%[src]], %[src_step] \n"
            "ld1 {v2.16b}, [%[src]], %[src_step] \n"
            "ld1 {v3.16b}, [%[src]], %[src_step] \n"
            "ld1 {v4.16b}, [%[src]], %[src_step] \n"
            "ld1 {v5.16b}, [%[src]], %[src_step] \n"
            "ld1 {v6.16b}, [%[src]], %[src_step] \n"
            "ld1 {v7.16b}, [%[src]], %[src_step] \n"
            "ld1 {v8.16b}, [%[src]], %[src_step] \n"
            "ld1 {v9.16b}, [%[src]], %[src_step] \n"
            "ld1 {v10.16b}, [%[src]], %[src_step] \n"
            "ld1 {v11.16b}, [%[src]], %[src_step] \n"
            "ld1 {v12.16b}, [%[src]], %[src_step] \n"
            "ld1 {v13.16b}, [%[src]], %[src_step] \n"
            "ld1 {v14.16b}, [%[src]], %[src_step] \n"
            "ld1 {v15.16b}, [%[src]], %[src_step] \n"
            "trn1 v16.16b, v0.16b, v1.16b \n"
            "trn2 v17.16b, v0.16b, v1.16b \n"
            "trn1 v18.16b, v2.16b, v3.16b \n"
            "trn2 v19.16b, v2.16b, v3.16b \n"
            "trn1 v20.16b, v4.16b, v5.16b \n"
            "trn2 v21.16b, v4.16b, v5.16b \n"
            "trn1 v22.16b, v6.16b, v7.16b \n"
            "trn2 v23.16b, v6.16b, v7.16b \n"
            "trn1 v24.16b, v8.16b, v9.16b \n"
            "trn2 v25.16b, v8.16b, v9.16b \n"
            "trn1 v26.16b, v10.16b, v11.16b \n"
            "trn2 v27.16b, v10.16b, v11.16b \n"
            "trn1 v28.16b, v12.16b, v13.16b \n"
            "trn2 v29.16b, v12.16b, v13.16b \n"
            "trn1 v30.16b, v14.16b, v15.16b \n"
            "trn2 v31.16b, v14.16b, v15.16b \n"
            "trn1 v0.8h, v16.8h, v18.8h \n"
            "trn2 v2.8h, v16.8h, v18.8h \n"
            "trn1 v4.8h, v20.8h, v22.8h \n"
            "trn2 v6.8h, v20.8h, v22.8h \n"
            "trn1 v8.8h, v24.8h, v26.8h \n"
            "trn2 v10.8h, v24.8h, v26.8h \n"
            "trn1 v12.8h, v28.8h, v30.8h \n"
            "trn2 v14.8h, v28.8h, v30.8h \n"
            "trn1 v1.8h, v17.8h, v19.8h \n"
            "trn2 v3.8h, v17.8h, v19.8h \n"
            "trn1 v5.8h, v21.8h, v23.8h \n"
            "trn2 v7.8h, v21.8h, v23.8h \n"
            "trn1 v9.8h, v25.8h, v27.8h \n"
            "trn2 v11.8h, v25.8h, v27.8h \n"
            "trn1 v13.8h, v29.8h, v31.8h \n"
            "trn2 v15.8h, v29.8h, v31.8h \n"
            "trn1 v16.4s, v0.4s, v4.4s \n"
            "trn2 v20.4s, v0.4s, v4.4s \n"
            "trn1 v24.4s, v8.4s, v12.4s \n"
            "trn2 v28.4s, v8.4s, v12.4s \n"
            "trn1 v17.4s, v1.4s, v5.4s \n"
            "trn2 v21.4s, v1.4s, v5.4s \n"
            "trn1 v25.4s, v9.4s, v13.4s \n"
            "trn2 v29.4s, v9.4s, v13.4s \n"
            "trn1 v18.4s, v2.4s, v6.4s \n"
            "trn2 v22.4s, v2.4s, v6.4s \n"
            "trn1 v26.4s, v10.4s, v14.4s \n"
            "trn2 v30.4s, v10.4s, v14.4s \n"
            "trn1 v19.4s, v3.4s, v7.4s \n"
            "trn2 v23.4s, v3.4s, v7.4s \n"
            "trn1 v27.4s, v11.4s, v15.4s \n"
            "trn2 v31.4s, v11.4s, v15.4s \n"
            "trn1 v0.2d, v16.2d, v24.2d \n"
            "trn2 v8.2d, v16.2d, v24.2d \n"
            "trn1 v1.2d, v17.2d, v25.2d \n"
            "trn2 v9.2d, v17.2d, v25.2d \n"
            "trn1 v2.2d, v18.2d, v26.2d \n"
            "trn2 v10.2d, v18.2d, v26.2d \n"
            "trn1 v3.2d, v19.2d, v27.2d \n"
            "trn2 v11.2d, v19.2d, v27.2d \n"
            "trn1 v4.2d, v20.2d, v28.2d \n"
            "trn2 v12.2d, v20.2d, v28.2d \n"
            "trn1 v5.2d, v21.2d, v29.2d \n"
            "trn2 v13.2d, v21.2d, v29.2d \n"
            "trn1 v6.2d, v22.2d, v30.2d \n"
            "trn2 v14.2d, v22.2d, v30.2d \n"
            "trn1 v7.2d, v23.2d, v31.2d \n"
            "trn2 v15.2d, v23.2d, v31.2d \n"
            "st1 {v0.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v1.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v2.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v3.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v4.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v5.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v6.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v7.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v8.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v9.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v10.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v11.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v12.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v13.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v14.16b}, [%[dst]], %[dst_step] \n"
            "st1 {v15.16b}, [%[dst]], %[dst_step] \n"
            : [src] "+r"(src), [dst] "+r"(dst)
            : [src_step] "r"(src_step), [dst_step] "r"(dst_step)
            : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "d11",
              "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21",
              "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31");
}
struct Transpose4Byte {
    uint32_t v;
};
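
// Transpose an 8x8 block of 32-bit elements with NEON intrinsics. Each source
// row is loaded as a pair of 4-lane vectors; the left 8x4 half (val[0]) and
// the right 8x4 half (val[1]) are transposed independently with
// vzip1q/vzip2q at 32-bit and then 64-bit granularity. src_step and dst_step
// are row strides in uint32_t elements.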
static inline void trans_8x8_u32(
        const void* src, void* dst, const size_t src_step, const size_t dst_step) {
    uint32_t* src_ptr = (uint32_t*)src;
    uint32_t* dst_ptr = (uint32_t*)dst;
    uint32x4x2_t src0 = vld1q_u32_x2(src_ptr + 0 * src_step);  // A0A1A2A3
    uint32x4x2_t src1 = vld1q_u32_x2(src_ptr + 1 * src_step);  // B0B1B2B3
    uint32x4x2_t src2 = vld1q_u32_x2(src_ptr + 2 * src_step);  // C0C1C2C3
    uint32x4x2_t src3 = vld1q_u32_x2(src_ptr + 3 * src_step);  // D0D1D2D3
    uint32x4x2_t src4 = vld1q_u32_x2(src_ptr + 4 * src_step);  // E0E1E2E3
    uint32x4x2_t src5 = vld1q_u32_x2(src_ptr + 5 * src_step);  // F0F1F2F3
    uint32x4x2_t src6 = vld1q_u32_x2(src_ptr + 6 * src_step);  // G0G1G2G3
    uint32x4x2_t src7 = vld1q_u32_x2(src_ptr + 7 * src_step);  // H0H1H2H3
    uint32x4_t ab_low = vzip1q_u32(src0.val[0], src1.val[0]);   // A0B0A1B1
    uint32x4_t ab_high = vzip2q_u32(src0.val[0], src1.val[0]);  // A2B2A3B3
    uint32x4_t cd_low = vzip1q_u32(src2.val[0], src3.val[0]);   // C0D0C1D1
    uint32x4_t cd_high = vzip2q_u32(src2.val[0], src3.val[0]);  // C2D2C3D3
    uint32x4_t ef_low = vzip1q_u32(src4.val[0], src5.val[0]);   // E0F0E1F1
    uint32x4_t ef_high = vzip2q_u32(src4.val[0], src5.val[0]);  // E2F2E3F3
    uint32x4_t gh_low = vzip1q_u32(src6.val[0], src7.val[0]);   // G0H0G1H1
    uint32x4_t gh_high = vzip2q_u32(src6.val[0], src7.val[0]);  // G2H2G3H3
    uint32x4_t abcd_0 = vreinterpretq_u32_u64(vzip1q_u64(
            vreinterpretq_u64_u32(ab_low), vreinterpretq_u64_u32(cd_low)));  // A0B0C0D0
    uint32x4_t abcd_1 = vreinterpretq_u32_u64(vzip2q_u64(
            vreinterpretq_u64_u32(ab_low), vreinterpretq_u64_u32(cd_low)));  // A1B1C1D1
    uint32x4_t abcd_2 = vreinterpretq_u32_u64(vzip1q_u64(
            vreinterpretq_u64_u32(ab_high),
            vreinterpretq_u64_u32(cd_high)));  // A2B2C2D2
    uint32x4_t abcd_3 = vreinterpretq_u32_u64(vzip2q_u64(
            vreinterpretq_u64_u32(ab_high),
            vreinterpretq_u64_u32(cd_high)));  // A3B3C3D3
    uint32x4_t efgh_0 = vreinterpretq_u32_u64(vzip1q_u64(
            vreinterpretq_u64_u32(ef_low), vreinterpretq_u64_u32(gh_low)));  // E0F0G0H0
    uint32x4_t efgh_1 = vreinterpretq_u32_u64(vzip2q_u64(
            vreinterpretq_u64_u32(ef_low), vreinterpretq_u64_u32(gh_low)));  // E1F1G1H1
    uint32x4_t efgh_2 = vreinterpretq_u32_u64(vzip1q_u64(
            vreinterpretq_u64_u32(ef_high),
            vreinterpretq_u64_u32(gh_high)));  // E2F2G2H2
    uint32x4_t efgh_3 = vreinterpretq_u32_u64(vzip2q_u64(
            vreinterpretq_u64_u32(ef_high),
            vreinterpretq_u64_u32(gh_high)));  // E3F3G3H3
    vst1q_u32(dst_ptr + 0 * dst_step, abcd_0);
    vst1q_u32(dst_ptr + 0 * dst_step + 4, efgh_0);
    vst1q_u32(dst_ptr + 1 * dst_step, abcd_1);
    vst1q_u32(dst_ptr + 1 * dst_step + 4, efgh_1);
    vst1q_u32(dst_ptr + 2 * dst_step, abcd_2);
    vst1q_u32(dst_ptr + 2 * dst_step + 4, efgh_2);
    vst1q_u32(dst_ptr + 3 * dst_step, abcd_3);
    vst1q_u32(dst_ptr + 3 * dst_step + 4, efgh_3);

    ab_low = vzip1q_u32(src0.val[1], src1.val[1]);   // A0B0A1B1
    ab_high = vzip2q_u32(src0.val[1], src1.val[1]);  // A2B2A3B3
    cd_low = vzip1q_u32(src2.val[1], src3.val[1]);   // C0D0C1D1
    cd_high = vzip2q_u32(src2.val[1], src3.val[1]);  // C2D2C3D3
    ef_low = vzip1q_u32(src4.val[1], src5.val[1]);   // E0F0E1F1
    ef_high = vzip2q_u32(src4.val[1], src5.val[1]);  // E2F2E3F3
    gh_low = vzip1q_u32(src6.val[1], src7.val[1]);   // G0H0G1H1
    gh_high = vzip2q_u32(src6.val[1], src7.val[1]);  // G2H2G3H3
    abcd_0 = vreinterpretq_u32_u64(vzip1q_u64(
            vreinterpretq_u64_u32(ab_low), vreinterpretq_u64_u32(cd_low)));  // A0B0C0D0
    abcd_1 = vreinterpretq_u32_u64(vzip2q_u64(
            vreinterpretq_u64_u32(ab_low), vreinterpretq_u64_u32(cd_low)));  // A1B1C1D1
    abcd_2 = vreinterpretq_u32_u64(vzip1q_u64(
            vreinterpretq_u64_u32(ab_high),
            vreinterpretq_u64_u32(cd_high)));  // A2B2C2D2
    abcd_3 = vreinterpretq_u32_u64(vzip2q_u64(
            vreinterpretq_u64_u32(ab_high),
            vreinterpretq_u64_u32(cd_high)));  // A3B3C3D3
    efgh_0 = vreinterpretq_u32_u64(vzip1q_u64(
            vreinterpretq_u64_u32(ef_low), vreinterpretq_u64_u32(gh_low)));  // E0F0G0H0
    efgh_1 = vreinterpretq_u32_u64(vzip2q_u64(
            vreinterpretq_u64_u32(ef_low), vreinterpretq_u64_u32(gh_low)));  // E1F1G1H1
    efgh_2 = vreinterpretq_u32_u64(vzip1q_u64(
            vreinterpretq_u64_u32(ef_high),
            vreinterpretq_u64_u32(gh_high)));  // E2F2G2H2
    efgh_3 = vreinterpretq_u32_u64(vzip2q_u64(
            vreinterpretq_u64_u32(ef_high),
            vreinterpretq_u64_u32(gh_high)));  // E3F3G3H3
    vst1q_u32(dst_ptr + 4 * dst_step, abcd_0);
    vst1q_u32(dst_ptr + 4 * dst_step + 4, efgh_0);
    vst1q_u32(dst_ptr + 5 * dst_step, abcd_1);
    vst1q_u32(dst_ptr + 5 * dst_step + 4, efgh_1);
    vst1q_u32(dst_ptr + 6 * dst_step, abcd_2);
    vst1q_u32(dst_ptr + 6 * dst_step + 4, efgh_2);
    vst1q_u32(dst_ptr + 7 * dst_step, abcd_3);
    vst1q_u32(dst_ptr + 7 * dst_step + 4, efgh_3);
}
struct Transpose2Byte {
    uint16_t v;
};
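
// Transpose an 8x8 block of 16-bit elements with NEON intrinsics: rows are
// interleaved with vzip1q/vzip2q at 16-bit, 32-bit and finally 64-bit
// granularity. src_step and dst_step are row strides in uint16_t elements.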
static inline void trans_8x8_u16(
        const void* src, void* dst, const size_t src_step, const size_t dst_step) {
    uint16_t* src_ptr = (uint16_t*)src;
    uint16_t* dst_ptr = (uint16_t*)dst;
    uint16x8_t src0 = vld1q_u16(src_ptr + 0 * src_step);  // A0A1A2A3A4A5A6A7
    uint16x8_t src1 = vld1q_u16(src_ptr + 1 * src_step);  // B0B1B2B3B4B5B6B7
    uint16x8_t src2 = vld1q_u16(src_ptr + 2 * src_step);  // C0C1C2C3C4C5C6C7
    uint16x8_t src3 = vld1q_u16(src_ptr + 3 * src_step);  // D0D1D2D3D4D5D6D7
    uint16x8_t src4 = vld1q_u16(src_ptr + 4 * src_step);  // E0E1E2E3E4E5E6E7
    uint16x8_t src5 = vld1q_u16(src_ptr + 5 * src_step);  // F0F1F2F3F4F5F6F7
    uint16x8_t src6 = vld1q_u16(src_ptr + 6 * src_step);  // G0G1G2G3G4G5G6G7
    uint16x8_t src7 = vld1q_u16(src_ptr + 7 * src_step);  // H0H1H2H3H4H5H6H7
    uint16x8_t ab_low = vzip1q_u16(src0, src1);   // A0B0A1B1A2B2A3B3
    uint16x8_t ab_high = vzip2q_u16(src0, src1);  // A4B4A5B5A6B6A7B7
    uint16x8_t cd_low = vzip1q_u16(src2, src3);   // C0D0C1D1C2D2C3D3
    uint16x8_t cd_high = vzip2q_u16(src2, src3);  // C4D4C5D5C6D6C7D7
    uint16x8_t ef_low = vzip1q_u16(src4, src5);   // E0F0E1F1E2F2E3F3
    uint16x8_t ef_high = vzip2q_u16(src4, src5);  // E4F4E5F5E6F6E7F7
    uint16x8_t gh_low = vzip1q_u16(src6, src7);   // G0H0G1H1G2H2G3H3
    uint16x8_t gh_high = vzip2q_u16(src6, src7);  // G4H4G5H5G6H6G7H7
    uint16x8_t abcd_0 = vreinterpretq_u16_u32(vzip1q_u32(
            vreinterpretq_u32_u16(ab_low),
            vreinterpretq_u32_u16(cd_low)));  // A0B0C0D0A1B1C1D1
    uint16x8_t abcd_2 = vreinterpretq_u16_u32(vzip2q_u32(
            vreinterpretq_u32_u16(ab_low),
            vreinterpretq_u32_u16(cd_low)));  // A2B2C2D2A3B3C3D3
    uint16x8_t abcd_4 = vreinterpretq_u16_u32(vzip1q_u32(
            vreinterpretq_u32_u16(ab_high),
            vreinterpretq_u32_u16(cd_high)));  // A4B4C4D4A5B5C5D5
    uint16x8_t abcd_6 = vreinterpretq_u16_u32(vzip2q_u32(
            vreinterpretq_u32_u16(ab_high),
            vreinterpretq_u32_u16(cd_high)));  // A6B6C6D6A7B7C7D7
    uint16x8_t efgh_0 = vreinterpretq_u16_u32(vzip1q_u32(
            vreinterpretq_u32_u16(ef_low),
            vreinterpretq_u32_u16(gh_low)));  // E0F0G0H0E1F1G1H1
    uint16x8_t efgh_2 = vreinterpretq_u16_u32(vzip2q_u32(
            vreinterpretq_u32_u16(ef_low),
            vreinterpretq_u32_u16(gh_low)));  // E2F2G2H2E3F3G3H3
    uint16x8_t efgh_4 = vreinterpretq_u16_u32(vzip1q_u32(
            vreinterpretq_u32_u16(ef_high),
            vreinterpretq_u32_u16(gh_high)));  // E4F4G4H4E5F5G5H5
    uint16x8_t efgh_6 = vreinterpretq_u16_u32(vzip2q_u32(
            vreinterpretq_u32_u16(ef_high),
            vreinterpretq_u32_u16(gh_high)));  // E6F6G6H6E7F7G7H7
    uint16x8_t row_0 = vreinterpretq_u16_u64(
            vzip1q_u64(vreinterpretq_u64_u16(abcd_0), vreinterpretq_u64_u16(efgh_0)));
    uint16x8_t row_1 = vreinterpretq_u16_u64(
            vzip2q_u64(vreinterpretq_u64_u16(abcd_0), vreinterpretq_u64_u16(efgh_0)));
    uint16x8_t row_2 = vreinterpretq_u16_u64(
            vzip1q_u64(vreinterpretq_u64_u16(abcd_2), vreinterpretq_u64_u16(efgh_2)));
    uint16x8_t row_3 = vreinterpretq_u16_u64(
            vzip2q_u64(vreinterpretq_u64_u16(abcd_2), vreinterpretq_u64_u16(efgh_2)));
    uint16x8_t row_4 = vreinterpretq_u16_u64(
            vzip1q_u64(vreinterpretq_u64_u16(abcd_4), vreinterpretq_u64_u16(efgh_4)));
    uint16x8_t row_5 = vreinterpretq_u16_u64(
            vzip2q_u64(vreinterpretq_u64_u16(abcd_4), vreinterpretq_u64_u16(efgh_4)));
    uint16x8_t row_6 = vreinterpretq_u16_u64(
            vzip1q_u64(vreinterpretq_u64_u16(abcd_6), vreinterpretq_u64_u16(efgh_6)));
    uint16x8_t row_7 = vreinterpretq_u16_u64(
            vzip2q_u64(vreinterpretq_u64_u16(abcd_6), vreinterpretq_u64_u16(efgh_6)));
    vst1q_u16(dst_ptr + 0 * dst_step, row_0);
    vst1q_u16(dst_ptr + 1 * dst_step, row_1);
    vst1q_u16(dst_ptr + 2 * dst_step, row_2);
    vst1q_u16(dst_ptr + 3 * dst_step, row_3);
    vst1q_u16(dst_ptr + 4 * dst_step, row_4);
    vst1q_u16(dst_ptr + 5 * dst_step, row_5);
    vst1q_u16(dst_ptr + 6 * dst_step, row_6);
    vst1q_u16(dst_ptr + 7 * dst_step, row_7);
}
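
// Transpose an 8x4 block of 16-bit elements (8 source rows of 4 elements
// become 4 destination rows of 8); used by the variable-size overload of
// transpose_block below when block_h == 8 and block_w == 4.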
static inline void trans_8x4_u16(
        const void* src, void* dst, const size_t src_step, const size_t dst_step) {
    uint16_t* src_ptr = (uint16_t*)src;
    uint16_t* dst_ptr = (uint16_t*)dst;
    uint16x4_t src0 = vld1_u16(src_ptr + 0 * src_step);  // A0A1A2A3
    uint16x4_t src1 = vld1_u16(src_ptr + 1 * src_step);  // B0B1B2B3
    uint16x4_t src2 = vld1_u16(src_ptr + 2 * src_step);  // C0C1C2C3
    uint16x4_t src3 = vld1_u16(src_ptr + 3 * src_step);  // D0D1D2D3
    uint16x4_t src4 = vld1_u16(src_ptr + 4 * src_step);  // E0E1E2E3
    uint16x4_t src5 = vld1_u16(src_ptr + 5 * src_step);  // F0F1F2F3
    uint16x4_t src6 = vld1_u16(src_ptr + 6 * src_step);  // G0G1G2G3
    uint16x4_t src7 = vld1_u16(src_ptr + 7 * src_step);  // H0H1H2H3
    uint16x4_t ab_low = vzip1_u16(src0, src1);   // A0B0A1B1
    uint16x4_t ab_high = vzip2_u16(src0, src1);  // A2B2A3B3
    uint16x4_t cd_low = vzip1_u16(src2, src3);   // C0D0C1D1
    uint16x4_t cd_high = vzip2_u16(src2, src3);  // C2D2C3D3
    uint16x4_t ef_low = vzip1_u16(src4, src5);   // E0F0E1F1
    uint16x4_t ef_high = vzip2_u16(src4, src5);  // E2F2E3F3
    uint16x4_t gh_low = vzip1_u16(src6, src7);   // G0H0G1H1
    uint16x4_t gh_high = vzip2_u16(src6, src7);  // G2H2G3H3
    uint16x4_t abcd_0 = vreinterpret_u16_u32(vzip1_u32(
            vreinterpret_u32_u16(ab_low),
            vreinterpret_u32_u16(cd_low)));  // A0B0C0D0
    uint16x4_t abcd_1 = vreinterpret_u16_u32(vzip2_u32(
            vreinterpret_u32_u16(ab_low),
            vreinterpret_u32_u16(cd_low)));  // A1B1C1D1
    uint16x4_t abcd_2 = vreinterpret_u16_u32(vzip1_u32(
            vreinterpret_u32_u16(ab_high),
            vreinterpret_u32_u16(cd_high)));  // A2B2C2D2
    uint16x4_t abcd_3 = vreinterpret_u16_u32(vzip2_u32(
            vreinterpret_u32_u16(ab_high),
            vreinterpret_u32_u16(cd_high)));  // A3B3C3D3
    uint16x4_t efgh_0 = vreinterpret_u16_u32(vzip1_u32(
            vreinterpret_u32_u16(ef_low),
            vreinterpret_u32_u16(gh_low)));  // E0F0G0H0
    uint16x4_t efgh_1 = vreinterpret_u16_u32(vzip2_u32(
            vreinterpret_u32_u16(ef_low),
            vreinterpret_u32_u16(gh_low)));  // E1F1G1H1
    uint16x4_t efgh_2 = vreinterpret_u16_u32(vzip1_u32(
            vreinterpret_u32_u16(ef_high),
            vreinterpret_u32_u16(gh_high)));  // E2F2G2H2
    uint16x4_t efgh_3 = vreinterpret_u16_u32(vzip2_u32(
            vreinterpret_u32_u16(ef_high),
            vreinterpret_u32_u16(gh_high)));  // E3F3G3H3
    uint16x8_t row_0 = vcombine_u16(abcd_0, efgh_0);
    uint16x8_t row_1 = vcombine_u16(abcd_1, efgh_1);
    uint16x8_t row_2 = vcombine_u16(abcd_2, efgh_2);
    uint16x8_t row_3 = vcombine_u16(abcd_3, efgh_3);
    vst1q_u16(dst_ptr + 0 * dst_step, row_0);
    vst1q_u16(dst_ptr + 1 * dst_step, row_1);
    vst1q_u16(dst_ptr + 2 * dst_step, row_2);
    vst1q_u16(dst_ptr + 3 * dst_step, row_3);
}

}  // anonymous namespace
namespace megdnn {
namespace relayout {
namespace transpose_fallback {
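
// Hook the NEON kernels above into the generic blocked transpose framework:
// transpose_traits advertises the block size each element type handles
// natively, and the transpose_block specializations forward a full (or, for
// 2-byte elements, partial) block to the matching kernel.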
template <>
struct transpose_traits<TransposeByte> {
    static constexpr size_t block_size = 16;
};

template <>
void transpose_block<TransposeByte>(
        const TransposeByte* src, TransposeByte* dst, const size_t src_stride,
        const size_t dst_stride) {
    trans_16x16_u8(src, dst, src_stride, dst_stride);
}

template <>
struct transpose_traits<Transpose4Byte> {
    static constexpr size_t block_size = 8;
};

template <>
void transpose_block<Transpose4Byte>(
        const Transpose4Byte* src, Transpose4Byte* dst, const size_t src_stride,
        const size_t dst_stride) {
    trans_8x8_u32(src, dst, src_stride, dst_stride);
}

template <>
struct transpose_traits<Transpose2Byte> {
    static constexpr size_t block_size = 8;
};

template <>
void transpose_block<Transpose2Byte>(
        const Transpose2Byte* src, Transpose2Byte* dst, const size_t src_stride,
        const size_t dst_stride) {
    trans_8x8_u16(src, dst, src_stride, dst_stride);
}

template <>
void transpose_block<Transpose2Byte>(
        const Transpose2Byte* src, Transpose2Byte* dst, const size_t src_stride,
        const size_t dst_stride, size_t block_h, size_t block_w) {
    if (block_h == 8 && block_w == 4) {
        trans_8x4_u16(src, dst, src_stride, dst_stride);
    } else {
        transpose_block_fallback(src, dst, src_stride, dst_stride, block_h, block_w);
    }
}

}  // namespace transpose_fallback
}  // namespace relayout
}  // namespace megdnn
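
// Entry point: canonize the layouts, fall back to the generic implementation
// for 4-bit quantized dtypes, and, when the relayout is a (batched) transpose
// with c == 1, dispatch to the blocked transpose specialized for the element
// size (1, 2 or 4 bytes); everything else goes through exec_after_preprocess.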
void aarch64::RelayoutForwardImpl::exec(
        _megdnn_tensor_in src0, _megdnn_tensor_out dst0, Handle* src_handle) {
    check_cpu_handle(src_handle);
    TensorND src = src0, dst = dst0;
    check_layout_and_canonize(src.layout, dst.layout);
    // FIXME: optimize for lowbit cases
    if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
        src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
        fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
        return;
    }
    relayout::TransposeParam trans_param;
    bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param, true);
    if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
        MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose<TransposeByte>(
                trans_param.batch, trans_param.m, trans_param.n,
                static_cast<TransposeByte*>(src.raw_ptr()),
                static_cast<TransposeByte*>(dst.raw_ptr()), trans_param.stride_m));
        return;
    } else if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 2) {
        MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose<Transpose2Byte>(
                trans_param.batch, trans_param.m, trans_param.n,
                static_cast<Transpose2Byte*>(src.raw_ptr()),
                static_cast<Transpose2Byte*>(dst.raw_ptr()), trans_param.stride_m));
        return;
    } else if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 4) {
        MEGDNN_DISPATCH_CPU_KERN_OPR(transpose_fallback::transpose<Transpose4Byte>(
                trans_param.batch, trans_param.m, trans_param.n,
                static_cast<Transpose4Byte*>(src.raw_ptr()),
                static_cast<Transpose4Byte*>(dst.raw_ptr()), trans_param.stride_m));
        return;
    }
    exec_after_preprocess(src, dst, trans ? &trans_param : nullptr);
}

// vim: syntax=cpp.doxygen